---
# Model-card frontmatter (Hugging Face Hub metadata).
datasets:
  - Skywork/Skywork-Reward-Preference-80K-v0.1
base_model:
  - turboderp/Qwama-0.5B-Instruct
---
# Preference-optimization (DPO-style) LoRA fine-tuning hyperparameters.

# LoRA adapter rank (dimension of the low-rank update matrices).
lora_rank: 32
# Beta temperature for the preference loss.
pref_beta: 0.1
# Maximum sequence length in tokens; longer inputs are truncated.
cutoff_len: 2048

# Effective batch size = per_device_train_batch_size
#   * gradient_accumulation_steps * num_devices.
per_device_train_batch_size: 2
gradient_accumulation_steps: 8

# Optimizer / schedule.
learning_rate: 5.0e-6
num_train_epochs: 1.0
lr_scheduler_type: cosine
# Fraction of total steps spent linearly warming up the learning rate.
warmup_ratio: 0.1