---
# metadata
# Flat mapping of dataset / base-model references and training
# hyperparameters. All keys are top-level.
# NOTE(review): the tool that consumes this file is not visible here; the
# per-key notes below are inferred from key names -- confirm against it.

# Dataset identifiers (look like Hugging Face Hub repo ids -- TODO confirm).
datasets:
- Skywork/Skywork-Reward-Preference-80K-v0.1

# Base model identifier(s); kept as a one-element list.
base_model:
- turboderp/Qwama-0.5B-Instruct

# Adapter / preference-tuning settings.
# Presumably LoRA rank, preference-loss beta, and max token length -- verify.
lora_rank: 32
pref_beta: 0.1
cutoff_len: 2048

# Batch sizing: 2 x 8 = 16 samples per optimizer step per device, assuming
# the usual gradient-accumulation semantics.
per_device_train_batch_size: 2
gradient_accumulation_steps: 8

# Optimisation schedule.
# Keep the decimal point and the signed exponent on learning_rate: YAML 1.1
# loaders such as PyYAML only resolve that spelling as a float.
learning_rate: 5.0e-6
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.1