```yaml
datasets:
- Skywork/Skywork-Reward-Preference-80K-v0.1
base_model:
- turboderp/Qwama-0.5B-Instruct
lora_rank: 32
pref_beta: 0.1
cutoff_len: 2048
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 5.0e-6
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
```
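The `pref_beta` and `cutoff_len` keys follow LLaMA-Factory's naming for DPO preference training. As a rough guide, the sketch below shows how the same hyperparameters might be wired into Hugging Face TRL's `DPOTrainer` with a PEFT LoRA adapter; anything not listed in the config above (the `lora_alpha` value, the output directory, the dataset split, and the exact TRL/PEFT API version) is an assumption, not part of the original recipe.

```python
# Minimal sketch, assuming TRL >= 0.12 (processing_class argument) and that
# this TRL version accepts the dataset's conversational chosen/rejected columns.
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model_id = "turboderp/Qwama-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

train_dataset = load_dataset(
    "Skywork/Skywork-Reward-Preference-80K-v0.1", split="train"
)

# lora_rank: 32 from the config; lora_alpha is an assumed value.
peft_config = LoraConfig(r=32, lora_alpha=64, task_type="CAUSAL_LM")

args = DPOConfig(
    output_dir="qwama-dpo-lora",    # assumed output path
    beta=0.1,                       # pref_beta
    max_length=2048,                # cutoff_len
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5.0e-6,
    num_train_epochs=1.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
)

trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    # With a peft_config, DPOTrainer uses the adapter-disabled base model
    # as the frozen reference, so no separate ref_model is needed.
    peft_config=peft_config,
)
trainer.train()
```

With `per_device_train_batch_size: 2` and `gradient_accumulation_steps: 8`, the effective batch size is 16 samples per device per optimizer step.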