{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "use_flash_attention": false,
    "attn_implementation": "eager"
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "sort_direction": "ascending",
    "max_tokens": 2048,
    "text_field": "conversations",
    "shuffle_seed": 42,
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  }
}