{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 353.875, "epoch": 0.1, "grad_norm": 5.031754970550537, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": 0.0, "reward": 2.7734319660812616, "reward_std": 2.4145944046613295, "rewards/concensus_correctness_reward_func": 1.25, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.5963382440531859, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5520937647670507, "step": 2 }, { "completion_length": 334.46875, "epoch": 0.2, "grad_norm": 5.746564865112305, "kl": 0.0010456620839249808, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 1.7992137037217617, "reward_std": 0.7727034210693091, "rewards/concensus_correctness_reward_func": 0.08506250008940697, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5819949674187228, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6790312509983778, "step": 4 }, { "completion_length": 361.59375, "epoch": 0.3, "grad_norm": 3.3398702144622803, "kl": 0.001574008900206536, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 2.9710799902677536, "reward_std": 0.6227061338722706, "rewards/concensus_correctness_reward_func": 1.2891874983906746, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.6594237750396132, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6474687531590462, "step": 6 }, { "completion_length": 319.0625, "epoch": 0.4, "grad_norm": 4.177555561065674, "kl": 0.002741159994911868, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 2.2837948873639107, "reward_std": 1.5259181885048747, "rewards/concensus_correctness_reward_func": 0.29250000044703484, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.611263575963676, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.7550312634557486, "step": 8 }, { "completion_length": 315.1875, "epoch": 0.5, "grad_norm": 27.93036651611328, "kl": 0.0033685116504784673, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 1.7934808060526848, "reward_std": 1.1033665230497718, "rewards/concensus_correctness_reward_func": 0.23343750089406967, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.46682453801622614, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5932187382131815, "step": 10 }, { "completion_length": 294.90625, "epoch": 0.6, "grad_norm": 5.095856666564941, "kl": 0.004714873259217711, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 1.7133545614778996, "reward_std": 0.9104429350118153, "rewards/concensus_correctness_reward_func": 0.1314375028014183, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5073858371470124, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.7464062590152025, "step": 12 }, { "completion_length": 255.84375, "epoch": 0.7, "grad_norm": 5.774594306945801, "kl": 0.005561068042879924, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 2.31035278737545, "reward_std": 0.9512235811562277, "rewards/concensus_correctness_reward_func": 0.10537499934434891, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.6544152703136206, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.9099375084042549, "step": 14 }, { "completion_length": 274.25, "epoch": 0.8, "grad_norm": 4.460197448730469, "kl": 0.006159632313938346, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 3.217790398746729, "reward_std": 1.8388326059503015, "rewards/concensus_correctness_reward_func": 0.8442499972879887, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.7221341663971543, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.9014062602072954, "step": 16 }, { "completion_length": 301.46875, "epoch": 0.9, "grad_norm": 5.027771472930908, "kl": 0.006903290901391301, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 2.116435706615448, "reward_std": 0.9236437288418529, "rewards/concensus_correctness_reward_func": 0.2777499984949827, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6349669927731156, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.8912187665700912, "step": 18 }, { "completion_length": 251.90625, "epoch": 1.0, "grad_norm": 5.603349208831787, "kl": 0.008129101392114535, "learning_rate": 0.0, "loss": 0.0, "reward": 1.8469502702355385, "reward_std": 0.9816727490979247, "rewards/concensus_correctness_reward_func": 0.1430624984204769, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6068252576515079, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6595625132322311, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 4.049018025398255e-06, "train_runtime": 208.6037, "train_samples_per_second": 1.534, "train_steps_per_second": 0.096 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }