|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 747, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 159.95703125, |
|
"epoch": 0.020080321285140562, |
|
"grad_norm": 0.2833329439163208, |
|
"kl": 0.00034067649394273756, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.32109375, |
|
"reward_std": 0.2864044725894928, |
|
"rewards/acc_reward_func": 0.32109375, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 179.0203125, |
|
"epoch": 0.040160642570281124, |
|
"grad_norm": 0.47351399064064026, |
|
"kl": 0.000714331166818738, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.30625, |
|
"reward_std": 0.2533454954624176, |
|
"rewards/acc_reward_func": 0.30625, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 163.7078125, |
|
"epoch": 0.060240963855421686, |
|
"grad_norm": 0.6223832368850708, |
|
"kl": 0.0008577127242460847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.290625, |
|
"reward_std": 0.26476518511772157, |
|
"rewards/acc_reward_func": 0.290625, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 177.18203125, |
|
"epoch": 0.08032128514056225, |
|
"grad_norm": 0.3723163902759552, |
|
"kl": 0.000897675973828882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33125, |
|
"reward_std": 0.27346172034740446, |
|
"rewards/acc_reward_func": 0.33125, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 172.44921875, |
|
"epoch": 0.10040160642570281, |
|
"grad_norm": 0.4859201908111572, |
|
"kl": 0.0007807362941093743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35625, |
|
"reward_std": 0.28146005868911744, |
|
"rewards/acc_reward_func": 0.35625, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 173.3265625, |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 0.4038971960544586, |
|
"kl": 0.0007229511742480099, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31640625, |
|
"reward_std": 0.2735124319791794, |
|
"rewards/acc_reward_func": 0.31640625, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 168.64296875, |
|
"epoch": 0.14056224899598393, |
|
"grad_norm": 0.3058616816997528, |
|
"kl": 0.000607735151425004, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3265625, |
|
"reward_std": 0.24166617095470427, |
|
"rewards/acc_reward_func": 0.3265625, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 162.60703125, |
|
"epoch": 0.1606425702811245, |
|
"grad_norm": 0.2520759701728821, |
|
"kl": 0.0005803823471069336, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.309375, |
|
"reward_std": 0.2683109283447266, |
|
"rewards/acc_reward_func": 0.309375, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 167.0765625, |
|
"epoch": 0.18072289156626506, |
|
"grad_norm": 0.3606955409049988, |
|
"kl": 0.00041882623336277904, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.3390625, |
|
"reward_std": 0.270548814535141, |
|
"rewards/acc_reward_func": 0.3390625, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 173.20859375, |
|
"epoch": 0.20080321285140562, |
|
"grad_norm": 0.43256309628486633, |
|
"kl": 0.0004548234341200441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.3125, |
|
"reward_std": 0.24177098274230957, |
|
"rewards/acc_reward_func": 0.3125, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 168.24296875, |
|
"epoch": 0.22088353413654618, |
|
"grad_norm": 0.5047885775566101, |
|
"kl": 0.0004514524363912642, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.35, |
|
"reward_std": 0.27911658585071564, |
|
"rewards/acc_reward_func": 0.35, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 168.32109375, |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 0.2552633285522461, |
|
"kl": 0.00045427558943629266, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.31015625, |
|
"reward_std": 0.27751815915107725, |
|
"rewards/acc_reward_func": 0.31015625, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 173.72421875, |
|
"epoch": 0.26104417670682734, |
|
"grad_norm": 0.2749291956424713, |
|
"kl": 0.0004708627995569259, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.36171875, |
|
"reward_std": 0.2908409178256989, |
|
"rewards/acc_reward_func": 0.36171875, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 174.65390625, |
|
"epoch": 0.28112449799196787, |
|
"grad_norm": 0.26198047399520874, |
|
"kl": 0.000500894442666322, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.29296875, |
|
"reward_std": 0.2530863583087921, |
|
"rewards/acc_reward_func": 0.29296875, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 168.96953125, |
|
"epoch": 0.30120481927710846, |
|
"grad_norm": 0.34051886200904846, |
|
"kl": 0.0004913369542919099, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.34453125, |
|
"reward_std": 0.2887667536735535, |
|
"rewards/acc_reward_func": 0.34453125, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 166.57109375, |
|
"epoch": 0.321285140562249, |
|
"grad_norm": 0.2959311306476593, |
|
"kl": 0.0004982782644219697, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.328125, |
|
"reward_std": 0.25415654480457306, |
|
"rewards/acc_reward_func": 0.328125, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 169.40703125, |
|
"epoch": 0.3413654618473896, |
|
"grad_norm": 0.26805025339126587, |
|
"kl": 0.0005323876044712961, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.36796875, |
|
"reward_std": 0.2889047384262085, |
|
"rewards/acc_reward_func": 0.36796875, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 167.22890625, |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 0.3810369074344635, |
|
"kl": 0.0004968916124198586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.36015625, |
|
"reward_std": 0.2771797090768814, |
|
"rewards/acc_reward_func": 0.36015625, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 175.2796875, |
|
"epoch": 0.3815261044176707, |
|
"grad_norm": 0.28440332412719727, |
|
"kl": 0.0004675893171224743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.31484375, |
|
"reward_std": 0.27064948081970214, |
|
"rewards/acc_reward_func": 0.31484375, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 163.48203125, |
|
"epoch": 0.40160642570281124, |
|
"grad_norm": 0.3027651309967041, |
|
"kl": 0.0004759302770253271, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.3125, |
|
"reward_std": 0.2688873440027237, |
|
"rewards/acc_reward_func": 0.3125, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 175.22734375, |
|
"epoch": 0.42168674698795183, |
|
"grad_norm": 0.5133289098739624, |
|
"kl": 0.0004573037032969296, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.31015625, |
|
"reward_std": 0.2805974006652832, |
|
"rewards/acc_reward_func": 0.31015625, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 167.0984375, |
|
"epoch": 0.44176706827309237, |
|
"grad_norm": 0.3459770381450653, |
|
"kl": 0.0005908279563300312, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33359375, |
|
"reward_std": 0.2592057645320892, |
|
"rewards/acc_reward_func": 0.33359375, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 167.68671875, |
|
"epoch": 0.46184738955823296, |
|
"grad_norm": 0.4075464606285095, |
|
"kl": 0.0006865185336209833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.38203125, |
|
"reward_std": 0.265577495098114, |
|
"rewards/acc_reward_func": 0.38203125, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 163.8390625, |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 0.37416180968284607, |
|
"kl": 0.000659433496184647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.315625, |
|
"reward_std": 0.2419831484556198, |
|
"rewards/acc_reward_func": 0.315625, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 167.946875, |
|
"epoch": 0.5020080321285141, |
|
"grad_norm": 0.3058314025402069, |
|
"kl": 0.0005173740908503532, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32421875, |
|
"reward_std": 0.2936626195907593, |
|
"rewards/acc_reward_func": 0.32421875, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 164.896875, |
|
"epoch": 0.5220883534136547, |
|
"grad_norm": 0.3665063977241516, |
|
"kl": 0.0005303317215293646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.346875, |
|
"reward_std": 0.28919236958026884, |
|
"rewards/acc_reward_func": 0.346875, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 169.23515625, |
|
"epoch": 0.5421686746987951, |
|
"grad_norm": 0.38457340002059937, |
|
"kl": 0.0005899963027331979, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3453125, |
|
"reward_std": 0.25805322229862215, |
|
"rewards/acc_reward_func": 0.3453125, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 164.45390625, |
|
"epoch": 0.5622489959839357, |
|
"grad_norm": 0.2263726145029068, |
|
"kl": 0.0005681793205440045, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.27890625, |
|
"reward_std": 0.23708621561527252, |
|
"rewards/acc_reward_func": 0.27890625, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 167.23046875, |
|
"epoch": 0.5823293172690763, |
|
"grad_norm": 0.28835389018058777, |
|
"kl": 0.0005865491810254752, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.340625, |
|
"reward_std": 0.2691764384508133, |
|
"rewards/acc_reward_func": 0.340625, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 160.4046875, |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.2847937345504761, |
|
"kl": 0.0006729792105033994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3046875, |
|
"reward_std": 0.2551824957132339, |
|
"rewards/acc_reward_func": 0.3046875, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 160.740625, |
|
"epoch": 0.6224899598393574, |
|
"grad_norm": 0.41062450408935547, |
|
"kl": 0.0006228600163012743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3265625, |
|
"reward_std": 0.24656105935573577, |
|
"rewards/acc_reward_func": 0.3265625, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 168.340625, |
|
"epoch": 0.642570281124498, |
|
"grad_norm": 0.33770281076431274, |
|
"kl": 0.0009516201331280172, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3609375, |
|
"reward_std": 0.2608398377895355, |
|
"rewards/acc_reward_func": 0.3609375, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 161.2953125, |
|
"epoch": 0.6626506024096386, |
|
"grad_norm": 0.3424857556819916, |
|
"kl": 0.0007472435943782329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3734375, |
|
"reward_std": 0.292040029168129, |
|
"rewards/acc_reward_func": 0.3734375, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 173.7203125, |
|
"epoch": 0.6827309236947792, |
|
"grad_norm": 0.24203689396381378, |
|
"kl": 0.0007647084421478212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.28203125, |
|
"reward_std": 0.2562589019536972, |
|
"rewards/acc_reward_func": 0.28203125, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 167.025, |
|
"epoch": 0.7028112449799196, |
|
"grad_norm": 0.34411001205444336, |
|
"kl": 0.000770233990624547, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32578125, |
|
"reward_std": 0.25892365276813506, |
|
"rewards/acc_reward_func": 0.32578125, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 172.24453125, |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 0.29481959342956543, |
|
"kl": 0.0009015992400236428, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.340625, |
|
"reward_std": 0.25950155556201937, |
|
"rewards/acc_reward_func": 0.340625, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 166.625, |
|
"epoch": 0.7429718875502008, |
|
"grad_norm": 0.2277025729417801, |
|
"kl": 0.00077429274097085, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.334375, |
|
"reward_std": 0.2525084614753723, |
|
"rewards/acc_reward_func": 0.334375, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 159.42734375, |
|
"epoch": 0.7630522088353414, |
|
"grad_norm": 0.32006603479385376, |
|
"kl": 0.0008042196277529001, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.353125, |
|
"reward_std": 0.28777270913124087, |
|
"rewards/acc_reward_func": 0.353125, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 164.315625, |
|
"epoch": 0.7831325301204819, |
|
"grad_norm": 0.42280659079551697, |
|
"kl": 0.0008120649959892035, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.34140625, |
|
"reward_std": 0.2515306770801544, |
|
"rewards/acc_reward_func": 0.34140625, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 163.465625, |
|
"epoch": 0.8032128514056225, |
|
"grad_norm": 0.3453792333602905, |
|
"kl": 0.0007391226128675044, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33984375, |
|
"reward_std": 0.2593150854110718, |
|
"rewards/acc_reward_func": 0.33984375, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 160.57109375, |
|
"epoch": 0.8232931726907631, |
|
"grad_norm": 0.22764600813388824, |
|
"kl": 0.0009074539528228342, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33359375, |
|
"reward_std": 0.24634140729904175, |
|
"rewards/acc_reward_func": 0.33359375, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 173.69453125, |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 0.3373042941093445, |
|
"kl": 0.0008413837174884975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32734375, |
|
"reward_std": 0.2701482236385345, |
|
"rewards/acc_reward_func": 0.32734375, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 163.95546875, |
|
"epoch": 0.8634538152610441, |
|
"grad_norm": 0.43492230772972107, |
|
"kl": 0.0008612593519501388, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35546875, |
|
"reward_std": 0.25468774139881134, |
|
"rewards/acc_reward_func": 0.35546875, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 165.01953125, |
|
"epoch": 0.8835341365461847, |
|
"grad_norm": 0.47059279680252075, |
|
"kl": 0.0007435820298269391, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3390625, |
|
"reward_std": 0.26499341428279877, |
|
"rewards/acc_reward_func": 0.3390625, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 169.05078125, |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 0.22417674958705902, |
|
"kl": 0.0007649007253348828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.284375, |
|
"reward_std": 0.2533974200487137, |
|
"rewards/acc_reward_func": 0.284375, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 173.67109375, |
|
"epoch": 0.9236947791164659, |
|
"grad_norm": 0.2978118658065796, |
|
"kl": 0.0007940458599478006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2984375, |
|
"reward_std": 0.24650274217128754, |
|
"rewards/acc_reward_func": 0.2984375, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 167.99375, |
|
"epoch": 0.9437751004016064, |
|
"grad_norm": 0.2792234420776367, |
|
"kl": 0.0010552789666689933, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35, |
|
"reward_std": 0.26749635934829713, |
|
"rewards/acc_reward_func": 0.35, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 171.94609375, |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.2678660452365875, |
|
"kl": 0.0006605981849133969, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31875, |
|
"reward_std": 0.2612347215414047, |
|
"rewards/acc_reward_func": 0.31875, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 160.3421875, |
|
"epoch": 0.9839357429718876, |
|
"grad_norm": 0.6757539510726929, |
|
"kl": 0.0006685945438221097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33984375, |
|
"reward_std": 0.2489775002002716, |
|
"rewards/acc_reward_func": 0.33984375, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 159.35982360839844, |
|
"epoch": 1.0040160642570282, |
|
"grad_norm": 0.4328139126300812, |
|
"kl": 0.0009481518063694239, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3328125, |
|
"reward_std": 0.27164973616600036, |
|
"rewards/acc_reward_func": 0.3328125, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 170.5609375, |
|
"epoch": 1.0240963855421688, |
|
"grad_norm": 0.40751489996910095, |
|
"kl": 0.0014292935142293573, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.34296875, |
|
"reward_std": 0.26451934576034547, |
|
"rewards/acc_reward_func": 0.34296875, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 158.06484375, |
|
"epoch": 1.0441767068273093, |
|
"grad_norm": 0.32165759801864624, |
|
"kl": 0.0023219846189022064, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.31796875, |
|
"reward_std": 0.26662840247154235, |
|
"rewards/acc_reward_func": 0.31796875, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 162.784375, |
|
"epoch": 1.0642570281124497, |
|
"grad_norm": 0.4819350838661194, |
|
"kl": 0.0021671449765563013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3671875, |
|
"reward_std": 0.2797468721866608, |
|
"rewards/acc_reward_func": 0.3671875, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 174.1890625, |
|
"epoch": 1.0843373493975903, |
|
"grad_norm": 0.2753521502017975, |
|
"kl": 0.0023205589037388562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.33984375, |
|
"reward_std": 0.2735643357038498, |
|
"rewards/acc_reward_func": 0.33984375, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 173.61640625, |
|
"epoch": 1.104417670682731, |
|
"grad_norm": 0.2579421103000641, |
|
"kl": 0.0014010543003678323, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2578125, |
|
"reward_std": 0.2305402159690857, |
|
"rewards/acc_reward_func": 0.2578125, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 168.61640625, |
|
"epoch": 1.1244979919678715, |
|
"grad_norm": 0.3602357506752014, |
|
"kl": 0.0012595997890457512, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.359375, |
|
"reward_std": 0.3013946235179901, |
|
"rewards/acc_reward_func": 0.359375, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 171.4640625, |
|
"epoch": 1.144578313253012, |
|
"grad_norm": 0.2711893320083618, |
|
"kl": 0.0009470222401432693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35390625, |
|
"reward_std": 0.27838287949562074, |
|
"rewards/acc_reward_func": 0.35390625, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 172.66015625, |
|
"epoch": 1.1646586345381527, |
|
"grad_norm": 0.2950332760810852, |
|
"kl": 0.0009261242463253439, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33359375, |
|
"reward_std": 0.28246039152145386, |
|
"rewards/acc_reward_func": 0.33359375, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 174.0375, |
|
"epoch": 1.1847389558232932, |
|
"grad_norm": 0.42756450176239014, |
|
"kl": 0.0008738423697650432, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3171875, |
|
"reward_std": 0.268758499622345, |
|
"rewards/acc_reward_func": 0.3171875, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 163.82265625, |
|
"epoch": 1.2048192771084336, |
|
"grad_norm": 0.4912799000740051, |
|
"kl": 0.001153232657816261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33359375, |
|
"reward_std": 0.2731469988822937, |
|
"rewards/acc_reward_func": 0.33359375, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 158.7875, |
|
"epoch": 1.2248995983935742, |
|
"grad_norm": 0.30150240659713745, |
|
"kl": 0.0008952352683991194, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3546875, |
|
"reward_std": 0.27324608862400057, |
|
"rewards/acc_reward_func": 0.3546875, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 176.72421875, |
|
"epoch": 1.2449799196787148, |
|
"grad_norm": 0.35327720642089844, |
|
"kl": 0.0011502272100187838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35625, |
|
"reward_std": 0.27567465901374816, |
|
"rewards/acc_reward_func": 0.35625, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 166.778125, |
|
"epoch": 1.2650602409638554, |
|
"grad_norm": 0.3498896360397339, |
|
"kl": 0.0016141865635290742, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.30703125, |
|
"reward_std": 0.261915448307991, |
|
"rewards/acc_reward_func": 0.30703125, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 178.128125, |
|
"epoch": 1.285140562248996, |
|
"grad_norm": 0.297959566116333, |
|
"kl": 0.00151687542675063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.32421875, |
|
"reward_std": 0.27393734753131865, |
|
"rewards/acc_reward_func": 0.32421875, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 170.6953125, |
|
"epoch": 1.3052208835341366, |
|
"grad_norm": 0.365997850894928, |
|
"kl": 0.0010963335167616605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.315625, |
|
"reward_std": 0.2619461864233017, |
|
"rewards/acc_reward_func": 0.315625, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 166.228125, |
|
"epoch": 1.3253012048192772, |
|
"grad_norm": 0.36575648188591003, |
|
"kl": 0.0010827683610841632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31015625, |
|
"reward_std": 0.26013057231903075, |
|
"rewards/acc_reward_func": 0.31015625, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 166.16484375, |
|
"epoch": 1.3453815261044177, |
|
"grad_norm": 0.29126739501953125, |
|
"kl": 0.0009882883401587605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.309375, |
|
"reward_std": 0.25576087832450867, |
|
"rewards/acc_reward_func": 0.309375, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 173.39453125, |
|
"epoch": 1.3654618473895583, |
|
"grad_norm": 0.33560770750045776, |
|
"kl": 0.0011842920910567045, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31953125, |
|
"reward_std": 0.2704659789800644, |
|
"rewards/acc_reward_func": 0.31953125, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 173.82578125, |
|
"epoch": 1.3855421686746987, |
|
"grad_norm": 0.2982430160045624, |
|
"kl": 0.0015789813129231333, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34453125, |
|
"reward_std": 0.3086081326007843, |
|
"rewards/acc_reward_func": 0.34453125, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 162.96953125, |
|
"epoch": 1.4056224899598393, |
|
"grad_norm": 0.5575153827667236, |
|
"kl": 0.001101066661067307, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3765625, |
|
"reward_std": 0.29579696655273435, |
|
"rewards/acc_reward_func": 0.3765625, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 181.57578125, |
|
"epoch": 1.4257028112449799, |
|
"grad_norm": 0.5113179683685303, |
|
"kl": 0.001085875742137432, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.29609375, |
|
"reward_std": 0.26413264572620393, |
|
"rewards/acc_reward_func": 0.29609375, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 174.1046875, |
|
"epoch": 1.4457831325301205, |
|
"grad_norm": 0.47564586997032166, |
|
"kl": 0.0010274604661390184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32890625, |
|
"reward_std": 0.2666553735733032, |
|
"rewards/acc_reward_func": 0.32890625, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 164.609375, |
|
"epoch": 1.465863453815261, |
|
"grad_norm": 0.23254898190498352, |
|
"kl": 0.0011269458453170955, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3921875, |
|
"reward_std": 0.26207175552845, |
|
"rewards/acc_reward_func": 0.3921875, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 171.3859375, |
|
"epoch": 1.4859437751004017, |
|
"grad_norm": 0.2955392599105835, |
|
"kl": 0.0013015888049267232, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3390625, |
|
"reward_std": 0.2618942677974701, |
|
"rewards/acc_reward_func": 0.3390625, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 170.51328125, |
|
"epoch": 1.5060240963855422, |
|
"grad_norm": 0.2367907017469406, |
|
"kl": 0.0012052926933392883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.30859375, |
|
"reward_std": 0.243873330950737, |
|
"rewards/acc_reward_func": 0.30859375, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 171.328125, |
|
"epoch": 1.5261044176706826, |
|
"grad_norm": 0.4387330412864685, |
|
"kl": 0.0011040043318644166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.34296875, |
|
"reward_std": 0.27786171436309814, |
|
"rewards/acc_reward_func": 0.34296875, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 167.015625, |
|
"epoch": 1.5461847389558234, |
|
"grad_norm": 0.2528681457042694, |
|
"kl": 0.001643406949006021, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.36484375, |
|
"reward_std": 0.3019732892513275, |
|
"rewards/acc_reward_func": 0.36484375, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 150.4078125, |
|
"epoch": 1.5662650602409638, |
|
"grad_norm": 0.35000428557395935, |
|
"kl": 0.0013000907842069865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.378125, |
|
"reward_std": 0.28335249423980713, |
|
"rewards/acc_reward_func": 0.378125, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 158.709375, |
|
"epoch": 1.5863453815261044, |
|
"grad_norm": 0.259275883436203, |
|
"kl": 0.0016851373482495546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.31953125, |
|
"reward_std": 0.28603203892707824, |
|
"rewards/acc_reward_func": 0.31953125, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 160.56796875, |
|
"epoch": 1.606425702811245, |
|
"grad_norm": 0.31947171688079834, |
|
"kl": 0.001047816756181419, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3359375, |
|
"reward_std": 0.27272760272026064, |
|
"rewards/acc_reward_func": 0.3359375, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 161.90234375, |
|
"epoch": 1.6265060240963856, |
|
"grad_norm": 0.34025779366493225, |
|
"kl": 0.0011820008745416998, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33671875, |
|
"reward_std": 0.2665954947471619, |
|
"rewards/acc_reward_func": 0.33671875, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 167.6984375, |
|
"epoch": 1.6465863453815262, |
|
"grad_norm": 0.2868824005126953, |
|
"kl": 0.0010516393929719924, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3578125, |
|
"reward_std": 0.2612337410449982, |
|
"rewards/acc_reward_func": 0.3578125, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 166.8375, |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.30937594175338745, |
|
"kl": 0.0011767351999878884, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.30859375, |
|
"reward_std": 0.2570806533098221, |
|
"rewards/acc_reward_func": 0.30859375, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 170.19765625, |
|
"epoch": 1.6867469879518073, |
|
"grad_norm": 0.25181344151496887, |
|
"kl": 0.0010755043127574026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33125, |
|
"reward_std": 0.22994134724140167, |
|
"rewards/acc_reward_func": 0.33125, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 165.94609375, |
|
"epoch": 1.7068273092369477, |
|
"grad_norm": 0.396030992269516, |
|
"kl": 0.0012019906658679246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.315625, |
|
"reward_std": 0.27038955092430117, |
|
"rewards/acc_reward_func": 0.315625, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 167.24140625, |
|
"epoch": 1.7269076305220885, |
|
"grad_norm": 0.3269684314727783, |
|
"kl": 0.001297155674546957, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32890625, |
|
"reward_std": 0.26612446308135984, |
|
"rewards/acc_reward_func": 0.32890625, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 155.46015625, |
|
"epoch": 1.7469879518072289, |
|
"grad_norm": 0.46771514415740967, |
|
"kl": 0.0014664881862699985, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3234375, |
|
"reward_std": 0.24766394197940828, |
|
"rewards/acc_reward_func": 0.3234375, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 164.859375, |
|
"epoch": 1.7670682730923695, |
|
"grad_norm": 0.24466943740844727, |
|
"kl": 0.0012787181185558438, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.334375, |
|
"reward_std": 0.2795639634132385, |
|
"rewards/acc_reward_func": 0.334375, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 180.32578125, |
|
"epoch": 1.78714859437751, |
|
"grad_norm": 0.28328680992126465, |
|
"kl": 0.001317713246680796, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.29296875, |
|
"reward_std": 0.22741309106349944, |
|
"rewards/acc_reward_func": 0.29296875, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 163.96875, |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 0.27741825580596924, |
|
"kl": 0.0015525751281529666, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.32421875, |
|
"reward_std": 0.28004167675971986, |
|
"rewards/acc_reward_func": 0.32421875, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 165.8140625, |
|
"epoch": 1.8273092369477912, |
|
"grad_norm": 0.2740982472896576, |
|
"kl": 0.0013578152284026146, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33203125, |
|
"reward_std": 0.2814855635166168, |
|
"rewards/acc_reward_func": 0.33203125, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 164.31171875, |
|
"epoch": 1.8473895582329316, |
|
"grad_norm": 0.4178365468978882, |
|
"kl": 0.0013977297581732272, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.36796875, |
|
"reward_std": 0.2695493370294571, |
|
"rewards/acc_reward_func": 0.36796875, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 176.71953125, |
|
"epoch": 1.8674698795180724, |
|
"grad_norm": 0.2551046311855316, |
|
"kl": 0.0013892159098759294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.26328125, |
|
"reward_std": 0.24187707006931305, |
|
"rewards/acc_reward_func": 0.26328125, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 162.015625, |
|
"epoch": 1.8875502008032128, |
|
"grad_norm": 0.23321259021759033, |
|
"kl": 0.0014218664728105068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31328125, |
|
"reward_std": 0.24751116037368776, |
|
"rewards/acc_reward_func": 0.31328125, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 181.08125, |
|
"epoch": 1.9076305220883534, |
|
"grad_norm": 0.26958364248275757, |
|
"kl": 0.0012838932918384672, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31328125, |
|
"reward_std": 0.28040671050548555, |
|
"rewards/acc_reward_func": 0.31328125, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 162.40390625, |
|
"epoch": 1.927710843373494, |
|
"grad_norm": 0.27885714173316956, |
|
"kl": 0.0011875152122229338, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.36484375, |
|
"reward_std": 0.26504404842853546, |
|
"rewards/acc_reward_func": 0.36484375, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 158.46640625, |
|
"epoch": 1.9477911646586346, |
|
"grad_norm": 0.27203765511512756, |
|
"kl": 0.0011519475607201456, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.35390625, |
|
"reward_std": 0.25064152777194976, |
|
"rewards/acc_reward_func": 0.35390625, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 163.49921875, |
|
"epoch": 1.9678714859437751, |
|
"grad_norm": 0.38953107595443726, |
|
"kl": 0.002528494060970843, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0003, |
|
"reward": 0.33125, |
|
"reward_std": 0.265842866897583, |
|
"rewards/acc_reward_func": 0.33125, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 160.84140625, |
|
"epoch": 1.9879518072289155, |
|
"grad_norm": 0.268568754196167, |
|
"kl": 0.0014860291033983231, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31953125, |
|
"reward_std": 0.25731430053710935, |
|
"rewards/acc_reward_func": 0.31953125, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 179.959375, |
|
"epoch": 2.0080321285140563, |
|
"grad_norm": 0.3279021680355072, |
|
"kl": 0.0021104462211951613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34140625, |
|
"reward_std": 0.2936858534812927, |
|
"rewards/acc_reward_func": 0.34140625, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 172.03984375, |
|
"epoch": 2.0281124497991967, |
|
"grad_norm": 0.30037155747413635, |
|
"kl": 0.001345854508690536, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2671875, |
|
"reward_std": 0.23858949542045593, |
|
"rewards/acc_reward_func": 0.2671875, |
|
"step": 505 |
|
}, |
|
{ |
|
"completion_length": 166.13359375, |
|
"epoch": 2.0481927710843375, |
|
"grad_norm": 0.24528227746486664, |
|
"kl": 0.0013967655366286635, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.28203125, |
|
"reward_std": 0.2324840843677521, |
|
"rewards/acc_reward_func": 0.28203125, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 167.35703125, |
|
"epoch": 2.068273092369478, |
|
"grad_norm": 0.4017987847328186, |
|
"kl": 0.0014677543425932527, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.36484375, |
|
"reward_std": 0.2759640544652939, |
|
"rewards/acc_reward_func": 0.36484375, |
|
"step": 515 |
|
}, |
|
{ |
|
"completion_length": 171.8515625, |
|
"epoch": 2.0883534136546187, |
|
"grad_norm": 0.3457529842853546, |
|
"kl": 0.0014000870054587723, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.31875, |
|
"reward_std": 0.2790384829044342, |
|
"rewards/acc_reward_func": 0.31875, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 164.70078125, |
|
"epoch": 2.108433734939759, |
|
"grad_norm": 0.21619907021522522, |
|
"kl": 0.0014295668806880713, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3140625, |
|
"reward_std": 0.2750910699367523, |
|
"rewards/acc_reward_func": 0.3140625, |
|
"step": 525 |
|
}, |
|
{ |
|
"completion_length": 169.97265625, |
|
"epoch": 2.1285140562248994, |
|
"grad_norm": 0.31079721450805664, |
|
"kl": 0.0012559856520965695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33125, |
|
"reward_std": 0.24622083306312562, |
|
"rewards/acc_reward_func": 0.33125, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 170.13984375, |
|
"epoch": 2.1485943775100402, |
|
"grad_norm": 0.27532029151916504, |
|
"kl": 0.0015559423482045531, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.38359375, |
|
"reward_std": 0.28083202838897703, |
|
"rewards/acc_reward_func": 0.38359375, |
|
"step": 535 |
|
}, |
|
{ |
|
"completion_length": 177.421875, |
|
"epoch": 2.1686746987951806, |
|
"grad_norm": 0.31572937965393066, |
|
"kl": 0.0013410489307716488, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.265625, |
|
"reward_std": 0.2276999294757843, |
|
"rewards/acc_reward_func": 0.265625, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 167.60234375, |
|
"epoch": 2.1887550200803214, |
|
"grad_norm": 0.29043304920196533, |
|
"kl": 0.0014236285351216793, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3203125, |
|
"reward_std": 0.23548413515090943, |
|
"rewards/acc_reward_func": 0.3203125, |
|
"step": 545 |
|
}, |
|
{ |
|
"completion_length": 166.23671875, |
|
"epoch": 2.208835341365462, |
|
"grad_norm": 0.6554353833198547, |
|
"kl": 0.0018002047901973129, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3734375, |
|
"reward_std": 0.2930103540420532, |
|
"rewards/acc_reward_func": 0.3734375, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 159.91953125, |
|
"epoch": 2.2289156626506026, |
|
"grad_norm": 0.5253407955169678, |
|
"kl": 0.0016718338709324598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3703125, |
|
"reward_std": 0.2658358722925186, |
|
"rewards/acc_reward_func": 0.3703125, |
|
"step": 555 |
|
}, |
|
{ |
|
"completion_length": 172.54296875, |
|
"epoch": 2.248995983935743, |
|
"grad_norm": 0.28389930725097656, |
|
"kl": 0.0020956686232239006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3203125, |
|
"reward_std": 0.2805899143218994, |
|
"rewards/acc_reward_func": 0.3203125, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 166.2515625, |
|
"epoch": 2.2690763052208833, |
|
"grad_norm": 0.27247288823127747, |
|
"kl": 0.0015082385856658221, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3296875, |
|
"reward_std": 0.2336159199476242, |
|
"rewards/acc_reward_func": 0.3296875, |
|
"step": 565 |
|
}, |
|
{ |
|
"completion_length": 157.78359375, |
|
"epoch": 2.289156626506024, |
|
"grad_norm": 0.31251490116119385, |
|
"kl": 0.001438130042515695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3796875, |
|
"reward_std": 0.277830982208252, |
|
"rewards/acc_reward_func": 0.3796875, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 172.43984375, |
|
"epoch": 2.3092369477911645, |
|
"grad_norm": 0.37210726737976074, |
|
"kl": 0.001831929781474173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.296875, |
|
"reward_std": 0.2679367482662201, |
|
"rewards/acc_reward_func": 0.296875, |
|
"step": 575 |
|
}, |
|
{ |
|
"completion_length": 162.16015625, |
|
"epoch": 2.3293172690763053, |
|
"grad_norm": 0.5997582077980042, |
|
"kl": 0.0017479128437116742, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3875, |
|
"reward_std": 0.28567528128623965, |
|
"rewards/acc_reward_func": 0.3875, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 167.84765625, |
|
"epoch": 2.3493975903614457, |
|
"grad_norm": 0.23533369600772858, |
|
"kl": 0.001749329548329115, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.31875, |
|
"reward_std": 0.2516347885131836, |
|
"rewards/acc_reward_func": 0.31875, |
|
"step": 585 |
|
}, |
|
{ |
|
"completion_length": 165.9859375, |
|
"epoch": 2.3694779116465865, |
|
"grad_norm": 0.3740471303462982, |
|
"kl": 0.001700690435245633, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.31484375, |
|
"reward_std": 0.24785314798355101, |
|
"rewards/acc_reward_func": 0.31484375, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 160.9109375, |
|
"epoch": 2.389558232931727, |
|
"grad_norm": 0.8214355111122131, |
|
"kl": 0.0020116518251597883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.37265625, |
|
"reward_std": 0.2642554700374603, |
|
"rewards/acc_reward_func": 0.37265625, |
|
"step": 595 |
|
}, |
|
{ |
|
"completion_length": 177.66796875, |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 0.48972687125205994, |
|
"kl": 0.0014791298424825072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2984375, |
|
"reward_std": 0.2614441394805908, |
|
"rewards/acc_reward_func": 0.2984375, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 167.84765625, |
|
"epoch": 2.429718875502008, |
|
"grad_norm": 0.29758477210998535, |
|
"kl": 0.0022905914578586818, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.321875, |
|
"reward_std": 0.2514772891998291, |
|
"rewards/acc_reward_func": 0.321875, |
|
"step": 605 |
|
}, |
|
{ |
|
"completion_length": 160.82265625, |
|
"epoch": 2.4497991967871484, |
|
"grad_norm": 0.3575228750705719, |
|
"kl": 0.0013975306414067746, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.34765625, |
|
"reward_std": 0.28261533975601194, |
|
"rewards/acc_reward_func": 0.34765625, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 165.34296875, |
|
"epoch": 2.4698795180722892, |
|
"grad_norm": 0.21417830884456635, |
|
"kl": 0.0015371570363640786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.33671875, |
|
"reward_std": 0.25463504195213316, |
|
"rewards/acc_reward_func": 0.33671875, |
|
"step": 615 |
|
}, |
|
{ |
|
"completion_length": 173.79921875, |
|
"epoch": 2.4899598393574296, |
|
"grad_norm": 0.3487497866153717, |
|
"kl": 0.0016002030344679952, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.35078125, |
|
"reward_std": 0.2584144353866577, |
|
"rewards/acc_reward_func": 0.35078125, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 170.08125, |
|
"epoch": 2.5100401606425704, |
|
"grad_norm": 0.34159645438194275, |
|
"kl": 0.0013422498479485512, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.38203125, |
|
"reward_std": 0.2949507474899292, |
|
"rewards/acc_reward_func": 0.38203125, |
|
"step": 625 |
|
}, |
|
{ |
|
"completion_length": 163.34765625, |
|
"epoch": 2.5301204819277108, |
|
"grad_norm": 0.44099095463752747, |
|
"kl": 0.0015945957973599433, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34921875, |
|
"reward_std": 0.2757268697023392, |
|
"rewards/acc_reward_func": 0.34921875, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 170.05703125, |
|
"epoch": 2.550200803212851, |
|
"grad_norm": 0.4719444215297699, |
|
"kl": 0.0016027268255129456, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.32109375, |
|
"reward_std": 0.2545212864875793, |
|
"rewards/acc_reward_func": 0.32109375, |
|
"step": 635 |
|
}, |
|
{ |
|
"completion_length": 167.87421875, |
|
"epoch": 2.570281124497992, |
|
"grad_norm": 0.34449702501296997, |
|
"kl": 0.001873347139917314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.30390625, |
|
"reward_std": 0.2682854264974594, |
|
"rewards/acc_reward_func": 0.30390625, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 158.384375, |
|
"epoch": 2.5903614457831328, |
|
"grad_norm": 0.35067522525787354, |
|
"kl": 0.0016631773207336665, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4421875, |
|
"reward_std": 0.3065200746059418, |
|
"rewards/acc_reward_func": 0.4421875, |
|
"step": 645 |
|
}, |
|
{ |
|
"completion_length": 171.3046875, |
|
"epoch": 2.610441767068273, |
|
"grad_norm": 0.406903475522995, |
|
"kl": 0.001370012597180903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.32265625, |
|
"reward_std": 0.2770428955554962, |
|
"rewards/acc_reward_func": 0.32265625, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 170.6921875, |
|
"epoch": 2.6305220883534135, |
|
"grad_norm": 0.3024086654186249, |
|
"kl": 0.0018637768691405654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.33828125, |
|
"reward_std": 0.2950285911560059, |
|
"rewards/acc_reward_func": 0.33828125, |
|
"step": 655 |
|
}, |
|
{ |
|
"completion_length": 161.92890625, |
|
"epoch": 2.6506024096385543, |
|
"grad_norm": 0.25132983922958374, |
|
"kl": 0.0013798804953694343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.33671875, |
|
"reward_std": 0.25781014263629914, |
|
"rewards/acc_reward_func": 0.33671875, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 165.38203125, |
|
"epoch": 2.6706827309236947, |
|
"grad_norm": 0.24434784054756165, |
|
"kl": 0.0015540448017418384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34609375, |
|
"reward_std": 0.2942123174667358, |
|
"rewards/acc_reward_func": 0.34609375, |
|
"step": 665 |
|
}, |
|
{ |
|
"completion_length": 168.7765625, |
|
"epoch": 2.6907630522088355, |
|
"grad_norm": 0.28431424498558044, |
|
"kl": 0.0027118735713884236, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0003, |
|
"reward": 0.32734375, |
|
"reward_std": 0.26073101758956907, |
|
"rewards/acc_reward_func": 0.32734375, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 166.51484375, |
|
"epoch": 2.710843373493976, |
|
"grad_norm": 0.2701532542705536, |
|
"kl": 0.0017335619311779737, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.37890625, |
|
"reward_std": 0.29145594835281374, |
|
"rewards/acc_reward_func": 0.37890625, |
|
"step": 675 |
|
}, |
|
{ |
|
"completion_length": 172.08359375, |
|
"epoch": 2.7309236947791167, |
|
"grad_norm": 0.25592124462127686, |
|
"kl": 0.001596282934769988, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3484375, |
|
"reward_std": 0.3067091882228851, |
|
"rewards/acc_reward_func": 0.3484375, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 169.3625, |
|
"epoch": 2.751004016064257, |
|
"grad_norm": 0.369191437959671, |
|
"kl": 0.0018961878260597587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.35703125, |
|
"reward_std": 0.26675377786159515, |
|
"rewards/acc_reward_func": 0.35703125, |
|
"step": 685 |
|
}, |
|
{ |
|
"completion_length": 167.40546875, |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 0.3414689600467682, |
|
"kl": 0.001623287471011281, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34375, |
|
"reward_std": 0.24926708936691283, |
|
"rewards/acc_reward_func": 0.34375, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 164.5421875, |
|
"epoch": 2.791164658634538, |
|
"grad_norm": 0.2602121829986572, |
|
"kl": 0.0020977890817448497, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3359375, |
|
"reward_std": 0.2536847472190857, |
|
"rewards/acc_reward_func": 0.3359375, |
|
"step": 695 |
|
}, |
|
{ |
|
"completion_length": 153.6890625, |
|
"epoch": 2.8112449799196786, |
|
"grad_norm": 0.28262779116630554, |
|
"kl": 0.0015694845002144574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.38046875, |
|
"reward_std": 0.273573100566864, |
|
"rewards/acc_reward_func": 0.38046875, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 166.97421875, |
|
"epoch": 2.8313253012048194, |
|
"grad_norm": 0.2816270589828491, |
|
"kl": 0.001585571584291756, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.36640625, |
|
"reward_std": 0.30526658296585085, |
|
"rewards/acc_reward_func": 0.36640625, |
|
"step": 705 |
|
}, |
|
{ |
|
"completion_length": 163.82421875, |
|
"epoch": 2.8514056224899598, |
|
"grad_norm": 0.35617348551750183, |
|
"kl": 0.0016123745823279022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34375, |
|
"reward_std": 0.26365512013435366, |
|
"rewards/acc_reward_func": 0.34375, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 168.128125, |
|
"epoch": 2.8714859437751006, |
|
"grad_norm": 0.20444567501544952, |
|
"kl": 0.001708123623393476, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.378125, |
|
"reward_std": 0.27598778903484344, |
|
"rewards/acc_reward_func": 0.378125, |
|
"step": 715 |
|
}, |
|
{ |
|
"completion_length": 158.5953125, |
|
"epoch": 2.891566265060241, |
|
"grad_norm": 0.23954364657402039, |
|
"kl": 0.001798305264674127, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.36875, |
|
"reward_std": 0.28262283504009245, |
|
"rewards/acc_reward_func": 0.36875, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 168.59453125, |
|
"epoch": 2.9116465863453813, |
|
"grad_norm": 0.5855829119682312, |
|
"kl": 0.0019487401703372597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.33046875, |
|
"reward_std": 0.26820210814476014, |
|
"rewards/acc_reward_func": 0.33046875, |
|
"step": 725 |
|
}, |
|
{ |
|
"completion_length": 173.31171875, |
|
"epoch": 2.931726907630522, |
|
"grad_norm": 0.4006167948246002, |
|
"kl": 0.0018504543462768198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3234375, |
|
"reward_std": 0.2933495879173279, |
|
"rewards/acc_reward_func": 0.3234375, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 168.96015625, |
|
"epoch": 2.9518072289156625, |
|
"grad_norm": 0.4413718581199646, |
|
"kl": 0.002411281201057136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.34765625, |
|
"reward_std": 0.2572381556034088, |
|
"rewards/acc_reward_func": 0.34765625, |
|
"step": 735 |
|
}, |
|
{ |
|
"completion_length": 172.9421875, |
|
"epoch": 2.9718875502008033, |
|
"grad_norm": 0.3350299596786499, |
|
"kl": 0.002306809718720615, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.32109375, |
|
"reward_std": 0.27035961151123045, |
|
"rewards/acc_reward_func": 0.32109375, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 169.828125, |
|
"epoch": 2.9919678714859437, |
|
"grad_norm": 0.4135463535785675, |
|
"kl": 0.002015371876768768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0002, |
|
"reward": 0.3859375, |
|
"reward_std": 0.3136798143386841, |
|
"rewards/acc_reward_func": 0.3859375, |
|
"step": 745 |
|
}, |
|
{ |
|
"completion_length": 185.21177673339844, |
|
"epoch": 3.0, |
|
"kl": 0.0030494448728859425, |
|
"reward": 0.267578125, |
|
"reward_std": 0.2742668390274048, |
|
"rewards/acc_reward_func": 0.267578125, |
|
"step": 747, |
|
"total_flos": 0.0, |
|
"train_loss": 0.00012346780326337724, |
|
"train_runtime": 38700.7369, |
|
"train_samples_per_second": 0.616, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 747, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|