|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 20, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 391.84375, |
|
"epoch": 0.1, |
|
"grad_norm": 4.776199817657471, |
|
"kl": 0.0, |
|
"learning_rate": 4.965903258506806e-07, |
|
"loss": 0.0, |
|
"reward": 1.6934495995519683, |
|
"reward_std": 0.8525555054657161, |
|
"rewards/concensus_correctness_reward_func": 0.08275000005960464, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.4939496314327698, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.49174999515525997, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 395.4375, |
|
"epoch": 0.2, |
|
"grad_norm": 9.231148719787598, |
|
"kl": 0.0014984993867983576, |
|
"learning_rate": 4.698684378016222e-07, |
|
"loss": 0.0, |
|
"reward": 2.3257811442017555, |
|
"reward_std": 1.0566792025929317, |
|
"rewards/concensus_correctness_reward_func": 0.35550000332295895, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5732811409980059, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7719999738037586, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 420.0625, |
|
"epoch": 0.3, |
|
"grad_norm": 20.898244857788086, |
|
"kl": 0.0021197714013396762, |
|
"learning_rate": 4.193203929064353e-07, |
|
"loss": 0.0, |
|
"reward": 1.476895283907652, |
|
"reward_std": 1.1881672106683254, |
|
"rewards/concensus_correctness_reward_func": 0.06462499871850014, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5145827787928283, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5226874835789204, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 505.6875, |
|
"epoch": 0.4, |
|
"grad_norm": 4.914966583251953, |
|
"kl": 0.005482891705469228, |
|
"learning_rate": 3.5042385616324236e-07, |
|
"loss": 0.0, |
|
"reward": 2.152218254748732, |
|
"reward_std": 1.4283872889354825, |
|
"rewards/concensus_correctness_reward_func": 0.49387499690055847, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5518744704313576, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.48146875761449337, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 425.21875, |
|
"epoch": 0.5, |
|
"grad_norm": 5.341366767883301, |
|
"kl": 0.006879790460516233, |
|
"learning_rate": 2.706448363680831e-07, |
|
"loss": 0.0, |
|
"reward": 2.214655563235283, |
|
"reward_std": 1.2052832515910268, |
|
"rewards/concensus_correctness_reward_func": 0.10068750008940697, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.5675930762663484, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.796374985948205, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 442.5625, |
|
"epoch": 0.6, |
|
"grad_norm": 26.609264373779297, |
|
"kl": 0.009219564337399788, |
|
"learning_rate": 1.886286282148002e-07, |
|
"loss": 0.0, |
|
"reward": 1.690632976591587, |
|
"reward_std": 1.378328304272145, |
|
"rewards/concensus_correctness_reward_func": 0.2481249999254942, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.4131954708136618, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5293124988675117, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 428.0, |
|
"epoch": 0.7, |
|
"grad_norm": 4.339048862457275, |
|
"kl": 0.00990395323606208, |
|
"learning_rate": 1.1326296046939333e-07, |
|
"loss": 0.0, |
|
"reward": 1.9994048587977886, |
|
"reward_std": 1.3680745382444002, |
|
"rewards/concensus_correctness_reward_func": 0.1614375002682209, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5740923208650202, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.6388749964535236, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 438.03125, |
|
"epoch": 0.8, |
|
"grad_norm": 4.747790813446045, |
|
"kl": 0.007855257164919749, |
|
"learning_rate": 5.271487265090163e-08, |
|
"loss": 0.0, |
|
"reward": 1.9228089675307274, |
|
"reward_std": 0.6404511121800169, |
|
"rewards/concensus_correctness_reward_func": 0.11306249909102917, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.6690589864738286, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7031874842941761, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 416.875, |
|
"epoch": 0.9, |
|
"grad_norm": 5.46245002746582, |
|
"kl": 0.008813429856672883, |
|
"learning_rate": 1.3545689574841341e-08, |
|
"loss": 0.0, |
|
"reward": 1.7960961200296879, |
|
"reward_std": 1.2154418476857245, |
|
"rewards/concensus_correctness_reward_func": 0.12325000017881393, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.6314085975755006, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.728937485255301, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 418.15625, |
|
"epoch": 1.0, |
|
"grad_norm": 7.303548812866211, |
|
"kl": 0.012376634433167055, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 2.181873619556427, |
|
"reward_std": 1.3524082526564598, |
|
"rewards/concensus_correctness_reward_func": 0.18056249991059303, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.6620298526249826, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5892812423408031, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 20, |
|
"total_flos": 0.0, |
|
"train_loss": 6.427429616451263e-06, |
|
"train_runtime": 417.5088, |
|
"train_samples_per_second": 0.766, |
|
"train_steps_per_second": 0.048 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|