{
  "adapter_path": "jumbo_adapters",
  "batch_size": 16,
  "config": null,
  "data": "data/",
  "fine_tune_type": "lora",
  "grad_checkpoint": true,
  "iters": 3000,
  "learning_rate": 5e-05,
  "lora_parameters": {
    "keys": [
      "mlp.gate_proj",
      "mlp.down_proj",
      "self_attn.q_proj",
      "mlp.up_proj",
      "self_attn.v_proj",
      "self_attn.k_proj"
    ],
    "rank": 64,
    "alpha": 64,
    "dropout": 0.1,
    "scale": 16.0
  },
  "lr_schedule": {
    "name": "cosine_decay",
    "warmup": 500,
    "warmup_init": 1e-07,
    "arguments": [
      1e-05,
      500,
      1e-07
    ]
  },
  "max_seq_length": 512,
  "model": "Qwen/Qwen2.5-3B",
  "num_layers": 36,
  "resume_adapter_file": null,
  "save_every": 50,
  "seed": 24,
  "steps_per_eval": 50,
  "steps_per_report": 10,
  "test": true,
  "test_batches": 200,
  "testfile": "test.jsonl",
  "train": true,
  "val_batches": 50
}
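This appears to be the set of training options that `mlx_lm.lora` records (for example, the `adapter_config.json` written alongside the trained adapters), so each key maps to a training option. The `lr_schedule` block is the least self-explanatory part: assuming mlx-lm's convention, `arguments` are passed positionally to the named `mlx.optimizers` schedule, here `cosine_decay(init=1e-05, decay_steps=500, end=1e-07)`, with a linear warmup from `warmup_init` up to that peak over the first `warmup` steps. Note that when a schedule is supplied it typically overrides the flat `learning_rate`, so the effective peak is 1e-05, not 5e-05. A minimal sketch of the resulting schedule, built directly with `mlx.optimizers` (the probed step values are illustrative, not from the config):

```python
# Sketch of the schedule the lr_schedule block describes, assuming
# `arguments` map to cosine_decay(init, decay_steps, end) and warmup is a
# linear ramp from warmup_init to init. Not mlx-lm's exact builder.
import mlx.optimizers as optim

warmup = optim.linear_schedule(1e-07, 1e-05, steps=500)        # ramp to the peak
decay = optim.cosine_decay(1e-05, decay_steps=500, end=1e-07)  # then cosine down
schedule = optim.join_schedules([warmup, decay], [500])

# Probe a few steps (illustrative values, not from the config):
for step in (0, 250, 500, 750, 1000, 3000):
    print(f"step {step:>4}: lr = {float(schedule(step)):.2e}")
```

One consequence worth flagging: with `decay_steps` of 500 against 3000 total iterations, the rate reaches its 1e-07 floor around step 1000 and stays there for the remaining two thirds of training.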
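The `lora_parameters` block controls which projections receive adapters (`keys`) and how each adapter behaves. Below is a generic sketch of the standard LoRA formulation these fields parameterize; this is not mlx-lm's internal adapter class, and the class name and example dimensions are made up for illustration:

```python
import math

import mlx.core as mx
import mlx.nn as nn


class LoRALinearSketch(nn.Module):
    """Standard LoRA update: y = Wx + scale * (dropout(x) @ A @ B)."""

    def __init__(self, in_dims, out_dims, rank=64, scale=16.0, dropout=0.1):
        super().__init__()
        self.linear = nn.Linear(in_dims, out_dims, bias=False)
        self.linear.freeze()  # the base projection stays frozen
        self.dropout = nn.Dropout(p=dropout)
        self.scale = scale
        # Low-rank factors: A maps down to `rank`, B maps back up.
        bound = 1 / math.sqrt(in_dims)
        self.lora_a = mx.random.uniform(low=-bound, high=bound, shape=(in_dims, rank))
        self.lora_b = mx.zeros((rank, out_dims))  # zero init: no update at step 0

    def __call__(self, x):
        base = self.linear(x)
        update = self.dropout(x) @ self.lora_a @ self.lora_b
        return base + self.scale * update


# Example: wrap one projection (dimensions are placeholders, not Qwen's).
layer = LoRALinearSketch(in_dims=2048, out_dims=2048)
y = layer(mx.random.normal((1, 8, 2048)))
```

Since `rank` and `alpha` are both 64, the conventional alpha/rank ratio would be 1.0, so the explicit `scale` of 16.0 is what actually amplifies the low-rank update here; trainers that take `scale` directly typically ignore `alpha`.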