|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.8, |
|
"eval_steps": 500, |
|
"global_step": 10, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 812.5, |
|
"completions/mean_length": 647.75, |
|
"completions/mean_terminated_length": 475.3166809082031, |
|
"completions/min_length": 138.5, |
|
"completions/min_terminated_length": 138.5, |
|
"epoch": 1.8, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.236729621887207, |
|
"kl": 0.00027297700398776215, |
|
"learning_rate": 5e-07, |
|
"loss": -0.001, |
|
"num_tokens": 14460.0, |
|
"reward": 0.14074324816465378, |
|
"reward_std": 0.12783230934292078, |
|
"rewards/concensus_correctness_reward_func/mean": 0.07774999737739563, |
|
"rewards/concensus_correctness_reward_func/std": 0.21991021931171417, |
|
"rewards/consensus_reward_func/mean": 0.0, |
|
"rewards/consensus_reward_func/std": 0.0, |
|
"rewards/cumulative_reward_2/mean": 0.0, |
|
"rewards/cumulative_reward_2/std": 0.0, |
|
"rewards/final_correctness_reward_func/mean": 0.0, |
|
"rewards/final_correctness_reward_func/std": 0.0, |
|
"rewards/question_recreation_reward_func/mean": 0.06299325078725815, |
|
"rewards/question_recreation_reward_func/std": 0.018077346496284008, |
|
"rewards/soft_format_reward_func/mean": 0.0, |
|
"rewards/soft_format_reward_func/std": 0.0, |
|
"rewards/strict_format_reward_func/mean": 0.0, |
|
"rewards/strict_format_reward_func/std": 0.0, |
|
"rewards/xmlcount_reward_func/mean": 0.0, |
|
"rewards/xmlcount_reward_func/std": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 754.0, |
|
"completions/mean_length": 397.25, |
|
"completions/mean_terminated_length": 251.08333587646484, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 3.8, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.27091646194458, |
|
"kl": 0.0006203544398886152, |
|
"learning_rate": 4.864543104251586e-07, |
|
"loss": 0.0127, |
|
"num_tokens": 24912.0, |
|
"reward": 0.1337380139157176, |
|
"reward_std": 0.1742639576550573, |
|
"rewards/concensus_correctness_reward_func/mean": 0.11999999731779099, |
|
"rewards/concensus_correctness_reward_func/std": 0.33941125869750977, |
|
"rewards/consensus_reward_func/mean": 0.0, |
|
"rewards/consensus_reward_func/std": 0.0, |
|
"rewards/cumulative_reward_2/mean": 0.0, |
|
"rewards/cumulative_reward_2/std": 0.0, |
|
"rewards/final_correctness_reward_func/mean": 0.0, |
|
"rewards/final_correctness_reward_func/std": 0.0, |
|
"rewards/question_recreation_reward_func/mean": 0.013738006353378296, |
|
"rewards/question_recreation_reward_func/std": 0.010505724931135774, |
|
"rewards/soft_format_reward_func/mean": 0.0, |
|
"rewards/soft_format_reward_func/std": 0.0, |
|
"rewards/strict_format_reward_func/mean": 0.0, |
|
"rewards/strict_format_reward_func/std": 0.0, |
|
"rewards/xmlcount_reward_func/mean": 0.0, |
|
"rewards/xmlcount_reward_func/std": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 846.5, |
|
"completions/mean_length": 528.4375, |
|
"completions/mean_terminated_length": 422.8214416503906, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 5.8, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.0706632137298584, |
|
"kl": 0.0008973750846053008, |
|
"learning_rate": 4.472851273490984e-07, |
|
"loss": 0.0403, |
|
"num_tokens": 37463.0, |
|
"reward": 0.12599835265427828, |
|
"reward_std": 0.16678897803649306, |
|
"rewards/concensus_correctness_reward_func/mean": 0.1107499971985817, |
|
"rewards/concensus_correctness_reward_func/std": 0.31324827671051025, |
|
"rewards/consensus_reward_func/mean": 0.0, |
|
"rewards/consensus_reward_func/std": 0.0, |
|
"rewards/cumulative_reward_2/mean": 0.0, |
|
"rewards/cumulative_reward_2/std": 0.0, |
|
"rewards/final_correctness_reward_func/mean": 0.0, |
|
"rewards/final_correctness_reward_func/std": 0.0, |
|
"rewards/question_recreation_reward_func/mean": 0.015248352661728859, |
|
"rewards/question_recreation_reward_func/std": 0.015421947930008173, |
|
"rewards/soft_format_reward_func/mean": 0.0, |
|
"rewards/soft_format_reward_func/std": 0.0, |
|
"rewards/strict_format_reward_func/mean": 0.0, |
|
"rewards/strict_format_reward_func/std": 0.0, |
|
"rewards/xmlcount_reward_func/mean": 0.0, |
|
"rewards/xmlcount_reward_func/std": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 744.5, |
|
"completions/mean_length": 588.75, |
|
"completions/mean_terminated_length": 384.3666687011719, |
|
"completions/min_length": 119.0, |
|
"completions/min_terminated_length": 119.0, |
|
"epoch": 7.8, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.6464202404022217, |
|
"kl": 0.0010676352321752347, |
|
"learning_rate": 3.867370395306068e-07, |
|
"loss": 0.1472, |
|
"num_tokens": 50979.0, |
|
"reward": 0.05322604067623615, |
|
"reward_std": 0.02834410360082984, |
|
"rewards/concensus_correctness_reward_func/mean": 0.0, |
|
"rewards/concensus_correctness_reward_func/std": 0.0, |
|
"rewards/consensus_reward_func/mean": 0.0, |
|
"rewards/consensus_reward_func/std": 0.0, |
|
"rewards/cumulative_reward_2/mean": 0.0, |
|
"rewards/cumulative_reward_2/std": 0.0, |
|
"rewards/final_correctness_reward_func/mean": 0.0, |
|
"rewards/final_correctness_reward_func/std": 0.0, |
|
"rewards/question_recreation_reward_func/mean": 0.05322604067623615, |
|
"rewards/question_recreation_reward_func/std": 0.03214400727301836, |
|
"rewards/soft_format_reward_func/mean": 0.0, |
|
"rewards/soft_format_reward_func/std": 0.0, |
|
"rewards/strict_format_reward_func/mean": 0.0, |
|
"rewards/strict_format_reward_func/std": 0.0, |
|
"rewards/xmlcount_reward_func/mean": 0.0, |
|
"rewards/xmlcount_reward_func/std": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 817.5, |
|
"completions/mean_length": 669.4375, |
|
"completions/mean_terminated_length": 470.0, |
|
"completions/min_length": 148.5, |
|
"completions/min_terminated_length": 148.5, |
|
"epoch": 9.8, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.48903226852417, |
|
"kl": 0.0014760679550818168, |
|
"learning_rate": 3.1137137178519977e-07, |
|
"loss": 0.2054, |
|
"num_tokens": 65786.0, |
|
"reward": 0.011943170800805092, |
|
"reward_std": 0.0064218416810035706, |
|
"rewards/concensus_correctness_reward_func/mean": 0.0, |
|
"rewards/concensus_correctness_reward_func/std": 0.0, |
|
"rewards/consensus_reward_func/mean": 0.0, |
|
"rewards/consensus_reward_func/std": 0.0, |
|
"rewards/cumulative_reward_2/mean": 0.0, |
|
"rewards/cumulative_reward_2/std": 0.0, |
|
"rewards/final_correctness_reward_func/mean": 0.0, |
|
"rewards/final_correctness_reward_func/std": 0.0, |
|
"rewards/question_recreation_reward_func/mean": 0.011943170800805092, |
|
"rewards/question_recreation_reward_func/std": 0.0063162968726828694, |
|
"rewards/soft_format_reward_func/mean": 0.0, |
|
"rewards/soft_format_reward_func/std": 0.0, |
|
"rewards/strict_format_reward_func/mean": 0.0, |
|
"rewards/strict_format_reward_func/std": 0.0, |
|
"rewards/xmlcount_reward_func/mean": 0.0, |
|
"rewards/xmlcount_reward_func/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"step": 10, |
|
"total_flos": 0.0, |
|
"train_loss": 0.08094322010874748, |
|
"train_runtime": 956.1563, |
|
"train_samples_per_second": 0.167, |
|
"train_steps_per_second": 0.021 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 65786, |
|
"num_train_epochs": 10, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|