|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 20, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 344.25, |
|
"epoch": 0.1, |
|
"grad_norm": 5.403124809265137, |
|
"kl": 0.0, |
|
"learning_rate": 5e-07, |
|
"loss": -0.0, |
|
"reward": 2.3487942395731807, |
|
"reward_std": 1.331467665731907, |
|
"rewards/concensus_correctness_reward_func": 0.6510000005364418, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.46238797600381076, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.46978125628083944, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 364.75, |
|
"epoch": 0.2, |
|
"grad_norm": 6.2670817375183105, |
|
"kl": 0.0021806862114317482, |
|
"learning_rate": 4.864543104251586e-07, |
|
"loss": 0.0, |
|
"reward": 2.635450964793563, |
|
"reward_std": 2.075526769272983, |
|
"rewards/concensus_correctness_reward_func": 0.9166250005364418, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.4957009544596076, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4731249902397394, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 404.71875, |
|
"epoch": 0.3, |
|
"grad_norm": 5.5379438400268555, |
|
"kl": 0.0030734814645256847, |
|
"learning_rate": 4.472851273490984e-07, |
|
"loss": 0.0, |
|
"reward": 2.522380673326552, |
|
"reward_std": 2.0315658951876685, |
|
"rewards/concensus_correctness_reward_func": 0.895562507212162, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5827869249042124, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.6065312507562339, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 354.0, |
|
"epoch": 0.4, |
|
"grad_norm": 6.651269435882568, |
|
"kl": 0.003421090141273453, |
|
"learning_rate": 3.867370395306068e-07, |
|
"loss": 0.0, |
|
"reward": 2.4581775926053524, |
|
"reward_std": 1.335879288148135, |
|
"rewards/concensus_correctness_reward_func": 0.6216250024735928, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.4378338464302942, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7112187594175339, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 300.0, |
|
"epoch": 0.5, |
|
"grad_norm": 6.796236038208008, |
|
"kl": 0.004635282557501341, |
|
"learning_rate": 3.1137137178519977e-07, |
|
"loss": 0.0, |
|
"reward": 2.6668134666979313, |
|
"reward_std": 1.702367587480694, |
|
"rewards/concensus_correctness_reward_func": 0.6051250025629997, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.61225092317909, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5744375269860029, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 246.28125, |
|
"epoch": 0.6, |
|
"grad_norm": 4.856728553771973, |
|
"kl": 0.006234622225747444, |
|
"learning_rate": 2.2935516363191693e-07, |
|
"loss": 0.0, |
|
"reward": 1.907209224998951, |
|
"reward_std": 1.2923782205325551, |
|
"rewards/concensus_correctness_reward_func": 0.2936249990016222, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.48558421013876796, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7530000098049641, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 277.84375, |
|
"epoch": 0.7, |
|
"grad_norm": 38.3473014831543, |
|
"kl": 0.005429227188869845, |
|
"learning_rate": 1.4957614383675767e-07, |
|
"loss": 0.0, |
|
"reward": 2.3881534561514854, |
|
"reward_std": 1.1518572303466499, |
|
"rewards/concensus_correctness_reward_func": 0.43924999982118607, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.557153444038704, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7667500115931034, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 289.0, |
|
"epoch": 0.8, |
|
"grad_norm": 8.100913047790527, |
|
"kl": 0.006314124533673748, |
|
"learning_rate": 8.067960709356478e-08, |
|
"loss": 0.0, |
|
"reward": 2.4665270633995533, |
|
"reward_std": 1.5265621307771653, |
|
"rewards/concensus_correctness_reward_func": 0.5556250028312206, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5798395671881735, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.6435625050216913, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 378.90625, |
|
"epoch": 0.9, |
|
"grad_norm": 4.619319915771484, |
|
"kl": 0.0059225473523838446, |
|
"learning_rate": 3.013156219837776e-08, |
|
"loss": 0.0, |
|
"reward": 2.774918533861637, |
|
"reward_std": 1.6789401592686772, |
|
"rewards/concensus_correctness_reward_func": 0.5491875000298023, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.5945435594767332, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.8186875078827143, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 296.0625, |
|
"epoch": 1.0, |
|
"grad_norm": 6.017106056213379, |
|
"kl": 0.006086832385335583, |
|
"learning_rate": 3.4096741493194193e-09, |
|
"loss": 0.0, |
|
"reward": 2.033856872469187, |
|
"reward_std": 1.216237832326442, |
|
"rewards/concensus_correctness_reward_func": 0.262687498703599, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5571068990975618, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.7765624914318323, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 20, |
|
"total_flos": 0.0, |
|
"train_loss": 4.321709275245666e-06, |
|
"train_runtime": 239.6333, |
|
"train_samples_per_second": 1.335, |
|
"train_steps_per_second": 0.083 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|