|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9980557355800389, |
|
"eval_steps": 500, |
|
"global_step": 385, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1848.5458984375, |
|
"epoch": 0.002592352559948153, |
|
"grad_norm": 0.15412024450495956, |
|
"kl": 0.0, |
|
"learning_rate": 2.564102564102564e-08, |
|
"loss": 0.0246, |
|
"reward": 1.4397025108337402, |
|
"reward_std": 0.4701927825808525, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.5978657752275467, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2130.4540100097656, |
|
"epoch": 0.005184705119896306, |
|
"grad_norm": 0.19408049978062328, |
|
"kl": 0.0, |
|
"learning_rate": 5.128205128205128e-08, |
|
"loss": 0.0596, |
|
"reward": 1.0504228472709656, |
|
"reward_std": 0.31693385541439056, |
|
"rewards/accuracy_reward": 0.6938775479793549, |
|
"rewards/improved_len_reward_dast": 0.3565452881157398, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2034.2958679199219, |
|
"epoch": 0.007777057679844459, |
|
"grad_norm": 0.1531077683543166, |
|
"kl": 0.0001348257064819336, |
|
"learning_rate": 7.692307692307692e-08, |
|
"loss": -0.0129, |
|
"reward": 1.0101122856140137, |
|
"reward_std": 0.4455054961144924, |
|
"rewards/accuracy_reward": 0.6581632494926453, |
|
"rewards/improved_len_reward_dast": 0.3519490174949169, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2119.744903564453, |
|
"epoch": 0.010369410239792612, |
|
"grad_norm": 0.1349622041652031, |
|
"kl": 0.00012981891632080078, |
|
"learning_rate": 1.0256410256410256e-07, |
|
"loss": -0.0044, |
|
"reward": 1.2723601460456848, |
|
"reward_std": 0.4871401861310005, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.46623772382736206, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 1834.7652893066406, |
|
"epoch": 0.012961762799740765, |
|
"grad_norm": 0.16434839601505108, |
|
"kl": 0.00012123584747314453, |
|
"learning_rate": 1.2820512820512818e-07, |
|
"loss": 0.0443, |
|
"reward": 1.267708569765091, |
|
"reward_std": 0.3166223168373108, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.5024024695158005, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 2152.540740966797, |
|
"epoch": 0.015554115359688918, |
|
"grad_norm": 0.15696438577129812, |
|
"kl": 0.00012969970703125, |
|
"learning_rate": 1.5384615384615385e-07, |
|
"loss": -0.0129, |
|
"reward": 1.0658827871084213, |
|
"reward_std": 0.4334075152873993, |
|
"rewards/accuracy_reward": 0.7142857164144516, |
|
"rewards/improved_len_reward_dast": 0.35159702971577644, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1747.4591674804688, |
|
"epoch": 0.01814646791963707, |
|
"grad_norm": 0.15893508336342455, |
|
"kl": 0.00010186433792114258, |
|
"learning_rate": 1.7948717948717948e-07, |
|
"loss": 0.0429, |
|
"reward": 1.1448375135660172, |
|
"reward_std": 0.37509680539369583, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.3846333734691143, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 1834.0611572265625, |
|
"epoch": 0.020738820479585224, |
|
"grad_norm": 0.1573166657366275, |
|
"kl": 0.00011396408081054688, |
|
"learning_rate": 2.0512820512820512e-07, |
|
"loss": 0.0036, |
|
"reward": 1.272167608141899, |
|
"reward_std": 0.3015933446586132, |
|
"rewards/accuracy_reward": 0.8010203838348389, |
|
"rewards/improved_len_reward_dast": 0.47114718705415726, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 2077.1122131347656, |
|
"epoch": 0.023331173039533377, |
|
"grad_norm": 0.15123878128380125, |
|
"kl": 0.0001251697540283203, |
|
"learning_rate": 2.3076923076923078e-07, |
|
"loss": 0.0025, |
|
"reward": 1.1346809566020966, |
|
"reward_std": 0.44101474434137344, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.38978295773267746, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 2001.6989135742188, |
|
"epoch": 0.02592352559948153, |
|
"grad_norm": 0.15946517595083978, |
|
"kl": 0.00013494491577148438, |
|
"learning_rate": 2.5641025641025636e-07, |
|
"loss": 0.0414, |
|
"reward": 1.0840217173099518, |
|
"reward_std": 0.37720372527837753, |
|
"rewards/accuracy_reward": 0.7244897931814194, |
|
"rewards/improved_len_reward_dast": 0.3595319651067257, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2258.3468322753906, |
|
"epoch": 0.028515878159429683, |
|
"grad_norm": 0.16258661653616813, |
|
"kl": 0.0001423358917236328, |
|
"learning_rate": 2.8205128205128203e-07, |
|
"loss": -0.0035, |
|
"reward": 1.035923331975937, |
|
"reward_std": 0.44437722116708755, |
|
"rewards/accuracy_reward": 0.6989795863628387, |
|
"rewards/improved_len_reward_dast": 0.33694368600845337, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 2071.6019897460938, |
|
"epoch": 0.031108230719377836, |
|
"grad_norm": 0.15520698307030686, |
|
"kl": 0.0001367330551147461, |
|
"learning_rate": 3.076923076923077e-07, |
|
"loss": 0.0151, |
|
"reward": 1.1415546834468842, |
|
"reward_std": 0.37767674773931503, |
|
"rewards/accuracy_reward": 0.7653061002492905, |
|
"rewards/improved_len_reward_dast": 0.3762484937906265, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1976.1530151367188, |
|
"epoch": 0.033700583279325985, |
|
"grad_norm": 0.17189810461087038, |
|
"kl": 0.00012564659118652344, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.0019, |
|
"reward": 1.125291794538498, |
|
"reward_std": 0.4003720059990883, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.36508774384856224, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 2114.5612182617188, |
|
"epoch": 0.03629293583927414, |
|
"grad_norm": 0.18307106761606742, |
|
"kl": 0.00011533498764038086, |
|
"learning_rate": 3.5897435897435896e-07, |
|
"loss": 0.0248, |
|
"reward": 1.0526445508003235, |
|
"reward_std": 0.33728349953889847, |
|
"rewards/accuracy_reward": 0.6530612111091614, |
|
"rewards/improved_len_reward_dast": 0.3995833285152912, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1440.3571166992188, |
|
"epoch": 0.03888528839922229, |
|
"grad_norm": 0.19219239961861387, |
|
"kl": 7.677078247070312e-05, |
|
"learning_rate": 3.8461538461538463e-07, |
|
"loss": 0.0411, |
|
"reward": 1.3660516738891602, |
|
"reward_std": 0.2804589569568634, |
|
"rewards/accuracy_reward": 0.9030611962080002, |
|
"rewards/improved_len_reward_dast": 0.46299050748348236, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1305.2295684814453, |
|
"epoch": 0.04147764095917045, |
|
"grad_norm": 0.18960595204343547, |
|
"kl": 9.632110595703125e-05, |
|
"learning_rate": 4.1025641025641024e-07, |
|
"loss": 0.0021, |
|
"reward": 1.418413519859314, |
|
"reward_std": 0.44618362933397293, |
|
"rewards/accuracy_reward": 0.9132652878761292, |
|
"rewards/improved_len_reward_dast": 0.5051482394337654, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 1996.841796875, |
|
"epoch": 0.0440699935191186, |
|
"grad_norm": 0.16908596036858053, |
|
"kl": 0.00011074542999267578, |
|
"learning_rate": 4.358974358974359e-07, |
|
"loss": 0.0341, |
|
"reward": 1.1314191222190857, |
|
"reward_std": 0.6118374243378639, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.37121502310037613, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1431.4846801757812, |
|
"epoch": 0.046662346079066754, |
|
"grad_norm": 0.22735446925703126, |
|
"kl": 8.571147918701172e-05, |
|
"learning_rate": 4.6153846153846156e-07, |
|
"loss": 0.0407, |
|
"reward": 1.206202208995819, |
|
"reward_std": 0.3719758912920952, |
|
"rewards/accuracy_reward": 0.8469387739896774, |
|
"rewards/improved_len_reward_dast": 0.3592635001987219, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 1709.688720703125, |
|
"epoch": 0.0492546986390149, |
|
"grad_norm": 0.18577878700500422, |
|
"kl": 0.00010377168655395508, |
|
"learning_rate": 4.871794871794871e-07, |
|
"loss": 0.0417, |
|
"reward": 1.1775241941213608, |
|
"reward_std": 0.5288017690181732, |
|
"rewards/accuracy_reward": 0.7806122303009033, |
|
"rewards/improved_len_reward_dast": 0.39691203087568283, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1838.2754821777344, |
|
"epoch": 0.05184705119896306, |
|
"grad_norm": 0.16046849749418657, |
|
"kl": 0.00011777877807617188, |
|
"learning_rate": 5.128205128205127e-07, |
|
"loss": 0.0208, |
|
"reward": 1.1064813733100891, |
|
"reward_std": 0.5807419717311859, |
|
"rewards/accuracy_reward": 0.7551020234823227, |
|
"rewards/improved_len_reward_dast": 0.3513793312013149, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 2217.14794921875, |
|
"epoch": 0.05443940375891121, |
|
"grad_norm": 0.1963426577198746, |
|
"kl": 0.00014448165893554688, |
|
"learning_rate": 5.384615384615384e-07, |
|
"loss": 0.0467, |
|
"reward": 1.0558834075927734, |
|
"reward_std": 0.558340422809124, |
|
"rewards/accuracy_reward": 0.6887754797935486, |
|
"rewards/improved_len_reward_dast": 0.36710788309574127, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 1927.3316040039062, |
|
"epoch": 0.057031756318859365, |
|
"grad_norm": 0.18525325793381328, |
|
"kl": 9.930133819580078e-05, |
|
"learning_rate": 5.641025641025641e-07, |
|
"loss": 0.0242, |
|
"reward": 1.1790167838335037, |
|
"reward_std": 0.4690204933285713, |
|
"rewards/accuracy_reward": 0.7857142835855484, |
|
"rewards/improved_len_reward_dast": 0.39330248534679413, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1841.6938171386719, |
|
"epoch": 0.059624108878807515, |
|
"grad_norm": 0.17253945143916685, |
|
"kl": 0.00010156631469726562, |
|
"learning_rate": 5.897435897435898e-07, |
|
"loss": 0.0724, |
|
"reward": 1.3324860334396362, |
|
"reward_std": 0.28684910759329796, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.5314656794071198, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1679.9642333984375, |
|
"epoch": 0.06221646143875567, |
|
"grad_norm": 0.20870673606371046, |
|
"kl": 0.00012958049774169922, |
|
"learning_rate": 6.153846153846154e-07, |
|
"loss": 0.0467, |
|
"reward": 1.1419631987810135, |
|
"reward_std": 0.38880112022161484, |
|
"rewards/accuracy_reward": 0.8010203838348389, |
|
"rewards/improved_len_reward_dast": 0.3409428298473358, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 2278.8673095703125, |
|
"epoch": 0.06480881399870382, |
|
"grad_norm": 0.15316366458717245, |
|
"kl": 0.00015485286712646484, |
|
"learning_rate": 6.410256410256411e-07, |
|
"loss": 0.0203, |
|
"reward": 0.9916537553071976, |
|
"reward_std": 0.43884778022766113, |
|
"rewards/accuracy_reward": 0.6479591578245163, |
|
"rewards/improved_len_reward_dast": 0.3436945825815201, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1853.4744873046875, |
|
"epoch": 0.06740116655865197, |
|
"grad_norm": 0.1623211083206233, |
|
"kl": 0.0001201629638671875, |
|
"learning_rate": 6.666666666666666e-07, |
|
"loss": 0.054, |
|
"reward": 1.1868394315242767, |
|
"reward_std": 0.4521937184035778, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.42663537338376045, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 1726.6427917480469, |
|
"epoch": 0.06999351911860013, |
|
"grad_norm": 0.21873628771810408, |
|
"kl": 0.0001125335693359375, |
|
"learning_rate": 6.923076923076922e-07, |
|
"loss": 0.086, |
|
"reward": 1.2924230992794037, |
|
"reward_std": 0.41079702973365784, |
|
"rewards/accuracy_reward": 0.8418367356061935, |
|
"rewards/improved_len_reward_dast": 0.45058638602495193, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1667.6071166992188, |
|
"epoch": 0.07258587167854828, |
|
"grad_norm": 0.18905776966101132, |
|
"kl": 0.00011527538299560547, |
|
"learning_rate": 7.179487179487179e-07, |
|
"loss": 0.045, |
|
"reward": 1.2638164162635803, |
|
"reward_std": 0.2763877250254154, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.4525919631123543, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 2032.4132080078125, |
|
"epoch": 0.07517822423849643, |
|
"grad_norm": 0.15326481666027458, |
|
"kl": 0.00012993812561035156, |
|
"learning_rate": 7.435897435897435e-07, |
|
"loss": 0.0002, |
|
"reward": 1.1888954937458038, |
|
"reward_std": 0.41189244389533997, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.42869146168231964, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 1764.4999389648438, |
|
"epoch": 0.07777057679844458, |
|
"grad_norm": 0.13723640714210214, |
|
"kl": 9.167194366455078e-05, |
|
"learning_rate": 7.692307692307693e-07, |
|
"loss": -0.0066, |
|
"reward": 1.0674456059932709, |
|
"reward_std": 0.4443123862147331, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.2970374431461096, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2198.729522705078, |
|
"epoch": 0.08036292935839275, |
|
"grad_norm": 0.15079546325320037, |
|
"kl": 0.0001614093780517578, |
|
"learning_rate": 7.948717948717948e-07, |
|
"loss": 0.013, |
|
"reward": 1.3089748322963715, |
|
"reward_std": 0.5274734199047089, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.48754626512527466, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 1879.6376647949219, |
|
"epoch": 0.0829552819183409, |
|
"grad_norm": 0.18155740478939822, |
|
"kl": 0.0001251697540283203, |
|
"learning_rate": 8.205128205128205e-07, |
|
"loss": 0.0131, |
|
"reward": 1.0791111141443253, |
|
"reward_std": 0.46941038966178894, |
|
"rewards/accuracy_reward": 0.7346938699483871, |
|
"rewards/improved_len_reward_dast": 0.34441729076206684, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1981.6274719238281, |
|
"epoch": 0.08554763447828904, |
|
"grad_norm": 0.1572483646834791, |
|
"kl": 0.0001424551010131836, |
|
"learning_rate": 8.461538461538461e-07, |
|
"loss": 0.0476, |
|
"reward": 1.3903695046901703, |
|
"reward_std": 0.4975530132651329, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.5332267209887505, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 2061.9999389648438, |
|
"epoch": 0.0881399870382372, |
|
"grad_norm": 0.1901994694040778, |
|
"kl": 0.0001537799835205078, |
|
"learning_rate": 8.717948717948718e-07, |
|
"loss": 0.0481, |
|
"reward": 1.1052793562412262, |
|
"reward_std": 0.4630768448114395, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.36038143932819366, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2465.1224060058594, |
|
"epoch": 0.09073233959818536, |
|
"grad_norm": 0.15096654762075654, |
|
"kl": 0.0001761913299560547, |
|
"learning_rate": 8.974358974358974e-07, |
|
"loss": 0.0009, |
|
"reward": 0.7364223003387451, |
|
"reward_std": 0.4229283332824707, |
|
"rewards/accuracy_reward": 0.5357142835855484, |
|
"rewards/improved_len_reward_dast": 0.20070804562419653, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 2199.688720703125, |
|
"epoch": 0.09332469215813351, |
|
"grad_norm": 0.1791438585472734, |
|
"kl": 0.0001895427703857422, |
|
"learning_rate": 9.230769230769231e-07, |
|
"loss": 0.0399, |
|
"reward": 1.2042141258716583, |
|
"reward_std": 0.3516070544719696, |
|
"rewards/accuracy_reward": 0.7755101770162582, |
|
"rewards/improved_len_reward_dast": 0.4287039190530777, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2019.6478881835938, |
|
"epoch": 0.09591704471808166, |
|
"grad_norm": 0.1921872688604767, |
|
"kl": 0.00020241737365722656, |
|
"learning_rate": 9.487179487179486e-07, |
|
"loss": 0.0187, |
|
"reward": 1.3608680367469788, |
|
"reward_std": 0.4326165243983269, |
|
"rewards/accuracy_reward": 0.8316326439380646, |
|
"rewards/improved_len_reward_dast": 0.5292353481054306, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1693.0, |
|
"epoch": 0.0985093972780298, |
|
"grad_norm": 0.19045468187511366, |
|
"kl": 0.0001348257064819336, |
|
"learning_rate": 9.743589743589742e-07, |
|
"loss": 0.0464, |
|
"reward": 1.3455627113580704, |
|
"reward_std": 0.3586850240826607, |
|
"rewards/accuracy_reward": 0.846938744187355, |
|
"rewards/improved_len_reward_dast": 0.49862393736839294, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2374.637725830078, |
|
"epoch": 0.10110174983797797, |
|
"grad_norm": 0.13494398794899917, |
|
"kl": 0.0002028942108154297, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0272, |
|
"reward": 0.8414318859577179, |
|
"reward_std": 0.48852086812257767, |
|
"rewards/accuracy_reward": 0.6224489659070969, |
|
"rewards/improved_len_reward_dast": 0.21898294147104025, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2517.3162841796875, |
|
"epoch": 0.10369410239792612, |
|
"grad_norm": 0.16744933736297124, |
|
"kl": 0.0002105236053466797, |
|
"learning_rate": 9.99981450718918e-07, |
|
"loss": 0.0616, |
|
"reward": 0.9213714599609375, |
|
"reward_std": 0.43374133110046387, |
|
"rewards/accuracy_reward": 0.6275510042905807, |
|
"rewards/improved_len_reward_dast": 0.2938204384408891, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 1807.0203857421875, |
|
"epoch": 0.10628645495787427, |
|
"grad_norm": 0.15669439739322064, |
|
"kl": 0.0002703666687011719, |
|
"learning_rate": 9.99925804404898e-07, |
|
"loss": 0.0228, |
|
"reward": 0.994490772485733, |
|
"reward_std": 0.5202224850654602, |
|
"rewards/accuracy_reward": 0.7193877547979355, |
|
"rewards/improved_len_reward_dast": 0.27510301768779755, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 1907.0305786132812, |
|
"epoch": 0.10887880751782242, |
|
"grad_norm": 0.1507066292700219, |
|
"kl": 0.00019288063049316406, |
|
"learning_rate": 9.998330656454915e-07, |
|
"loss": 0.0566, |
|
"reward": 1.3084075152873993, |
|
"reward_std": 0.3637009263038635, |
|
"rewards/accuracy_reward": 0.8367346823215485, |
|
"rewards/improved_len_reward_dast": 0.4716728553175926, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1946.2958984375, |
|
"epoch": 0.11147116007777058, |
|
"grad_norm": 0.21826053334493506, |
|
"kl": 0.0002913475036621094, |
|
"learning_rate": 9.99703242086198e-07, |
|
"loss": 0.0894, |
|
"reward": 1.0715700536966324, |
|
"reward_std": 0.4503963589668274, |
|
"rewards/accuracy_reward": 0.7397958934307098, |
|
"rewards/improved_len_reward_dast": 0.3317741868086159, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 1862.9591674804688, |
|
"epoch": 0.11406351263771873, |
|
"grad_norm": 0.18297677442826724, |
|
"kl": 0.000263214111328125, |
|
"learning_rate": 9.995363444298333e-07, |
|
"loss": 0.037, |
|
"reward": 1.2490134239196777, |
|
"reward_std": 0.4328879788517952, |
|
"rewards/accuracy_reward": 0.7653061076998711, |
|
"rewards/improved_len_reward_dast": 0.4837072864174843, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2316.530517578125, |
|
"epoch": 0.11665586519766688, |
|
"grad_norm": 0.15141936649503004, |
|
"kl": 0.0003380775451660156, |
|
"learning_rate": 9.993323864356492e-07, |
|
"loss": 0.0182, |
|
"reward": 0.7743872255086899, |
|
"reward_std": 0.55930295586586, |
|
"rewards/accuracy_reward": 0.5765305981040001, |
|
"rewards/improved_len_reward_dast": 0.19785663951188326, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2924.1683349609375, |
|
"epoch": 0.11924821775761503, |
|
"grad_norm": 0.12614913947783052, |
|
"kl": 0.0002567768096923828, |
|
"learning_rate": 9.990913849181977e-07, |
|
"loss": 0.0096, |
|
"reward": 0.8433035537600517, |
|
"reward_std": 0.41744476184248924, |
|
"rewards/accuracy_reward": 0.5561224333941936, |
|
"rewards/improved_len_reward_dast": 0.28718107007443905, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1805.5203552246094, |
|
"epoch": 0.1218405703175632, |
|
"grad_norm": 0.15881163011201838, |
|
"kl": 0.0007009506225585938, |
|
"learning_rate": 9.988133597459444e-07, |
|
"loss": 0.0175, |
|
"reward": 1.1679251790046692, |
|
"reward_std": 0.4487800747156143, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.3720068037509918, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 1873.7499389648438, |
|
"epoch": 0.12443292287751134, |
|
"grad_norm": 0.1713187626068608, |
|
"kl": 0.00028634071350097656, |
|
"learning_rate": 9.984983338396323e-07, |
|
"loss": 0.0488, |
|
"reward": 1.2101139575242996, |
|
"reward_std": 0.33226554840803146, |
|
"rewards/accuracy_reward": 0.760204091668129, |
|
"rewards/improved_len_reward_dast": 0.44990991055965424, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 1411.4234161376953, |
|
"epoch": 0.1270252754374595, |
|
"grad_norm": 0.18215178056260903, |
|
"kl": 0.0005662441253662109, |
|
"learning_rate": 9.981463331703903e-07, |
|
"loss": 0.0348, |
|
"reward": 1.4565084278583527, |
|
"reward_std": 0.3240164965391159, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.5891614705324173, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 1923.6836547851562, |
|
"epoch": 0.12961762799740764, |
|
"grad_norm": 0.21182741369137464, |
|
"kl": 0.00043964385986328125, |
|
"learning_rate": 9.977573867575937e-07, |
|
"loss": 0.0483, |
|
"reward": 1.0672244429588318, |
|
"reward_std": 0.42784378305077553, |
|
"rewards/accuracy_reward": 0.7244897782802582, |
|
"rewards/improved_len_reward_dast": 0.342734657227993, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 2293.10205078125, |
|
"epoch": 0.1322099805573558, |
|
"grad_norm": 0.17784622321620705, |
|
"kl": 0.0005965232849121094, |
|
"learning_rate": 9.9733152666647e-07, |
|
"loss": 0.0011, |
|
"reward": 1.119166985154152, |
|
"reward_std": 0.4692757725715637, |
|
"rewards/accuracy_reward": 0.6836734563112259, |
|
"rewards/improved_len_reward_dast": 0.43549349159002304, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 2606.8468627929688, |
|
"epoch": 0.13480233311730394, |
|
"grad_norm": 0.16188767449887392, |
|
"kl": 0.0004382133483886719, |
|
"learning_rate": 9.968687880054579e-07, |
|
"loss": 0.0355, |
|
"reward": 1.0624671429395676, |
|
"reward_std": 0.5272083953022957, |
|
"rewards/accuracy_reward": 0.6530612111091614, |
|
"rewards/improved_len_reward_dast": 0.4094058535993099, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 1741.494873046875, |
|
"epoch": 0.1373946856772521, |
|
"grad_norm": 0.18163262147540796, |
|
"kl": 0.0007987022399902344, |
|
"learning_rate": 9.963692089233104e-07, |
|
"loss": 0.0189, |
|
"reward": 1.1586879789829254, |
|
"reward_std": 0.3523149788379669, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.3678716644644737, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 1731.5713806152344, |
|
"epoch": 0.13998703823720027, |
|
"grad_norm": 0.17545616003222686, |
|
"kl": 0.000713348388671875, |
|
"learning_rate": 9.958328306059508e-07, |
|
"loss": 0.0163, |
|
"reward": 1.087464839220047, |
|
"reward_std": 0.37970298528671265, |
|
"rewards/accuracy_reward": 0.7499999701976776, |
|
"rewards/improved_len_reward_dast": 0.3374648429453373, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 1940.2244262695312, |
|
"epoch": 0.1425793907971484, |
|
"grad_norm": 0.20829916863603212, |
|
"kl": 0.0008840560913085938, |
|
"learning_rate": 9.952596972730782e-07, |
|
"loss": 0.0418, |
|
"reward": 1.136895164847374, |
|
"reward_std": 0.21965472772717476, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.37158904783427715, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 2024.3825988769531, |
|
"epoch": 0.14517174335709657, |
|
"grad_norm": 0.16061899047482414, |
|
"kl": 0.0006990432739257812, |
|
"learning_rate": 9.946498561745201e-07, |
|
"loss": 0.0061, |
|
"reward": 1.3091870546340942, |
|
"reward_std": 0.42107394337654114, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.50816660374403, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 1990.7856750488281, |
|
"epoch": 0.14776409591704473, |
|
"grad_norm": 0.17205784813401187, |
|
"kl": 0.0008096694946289062, |
|
"learning_rate": 9.94003357586339e-07, |
|
"loss": 0.0362, |
|
"reward": 1.3399446904659271, |
|
"reward_std": 0.34059275686740875, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.5185160860419273, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 2279.331573486328, |
|
"epoch": 0.15035644847699287, |
|
"grad_norm": 0.1637215457597632, |
|
"kl": 0.0006699562072753906, |
|
"learning_rate": 9.933202548066855e-07, |
|
"loss": 0.0424, |
|
"reward": 1.0715169459581375, |
|
"reward_std": 0.39220181107521057, |
|
"rewards/accuracy_reward": 0.6887754946947098, |
|
"rewards/improved_len_reward_dast": 0.38274142518639565, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2313.2499084472656, |
|
"epoch": 0.15294880103694103, |
|
"grad_norm": 0.16376379786761341, |
|
"kl": 0.00083160400390625, |
|
"learning_rate": 9.926006041514068e-07, |
|
"loss": 0.0178, |
|
"reward": 1.142714947462082, |
|
"reward_std": 0.3937602676451206, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.40291906148195267, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2046.1631774902344, |
|
"epoch": 0.15554115359688916, |
|
"grad_norm": 0.23236942157628335, |
|
"kl": 0.0009450912475585938, |
|
"learning_rate": 9.918444649494012e-07, |
|
"loss": 0.0662, |
|
"reward": 1.245220124721527, |
|
"reward_std": 0.2695602234452963, |
|
"rewards/accuracy_reward": 0.7755101770162582, |
|
"rewards/improved_len_reward_dast": 0.46970994770526886, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2175.6224060058594, |
|
"epoch": 0.15813350615683733, |
|
"grad_norm": 0.15376927864805173, |
|
"kl": 0.0009765625, |
|
"learning_rate": 9.9105189953773e-07, |
|
"loss": 0.0196, |
|
"reward": 1.2470524311065674, |
|
"reward_std": 0.45635347813367844, |
|
"rewards/accuracy_reward": 0.7653061002492905, |
|
"rewards/improved_len_reward_dast": 0.48174627125263214, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 2337.1581115722656, |
|
"epoch": 0.1607258587167855, |
|
"grad_norm": 0.15218316765828901, |
|
"kl": 0.0008411407470703125, |
|
"learning_rate": 9.90222973256475e-07, |
|
"loss": 0.0249, |
|
"reward": 1.37412428855896, |
|
"reward_std": 0.39829079806804657, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.552695706486702, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2680.4183349609375, |
|
"epoch": 0.16331821127673363, |
|
"grad_norm": 0.21218309711028285, |
|
"kl": 0.0010118484497070312, |
|
"learning_rate": 9.89357754443355e-07, |
|
"loss": 0.0529, |
|
"reward": 0.8223338723182678, |
|
"reward_std": 0.4073232337832451, |
|
"rewards/accuracy_reward": 0.5510203987360001, |
|
"rewards/improved_len_reward_dast": 0.2713134288787842, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2635.7550048828125, |
|
"epoch": 0.1659105638366818, |
|
"grad_norm": 0.1620590183136494, |
|
"kl": 0.000949859619140625, |
|
"learning_rate": 9.884563144280897e-07, |
|
"loss": 0.0464, |
|
"reward": 1.0863047987222672, |
|
"reward_std": 0.4714929535984993, |
|
"rewards/accuracy_reward": 0.678571417927742, |
|
"rewards/improved_len_reward_dast": 0.40773337706923485, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 1972.2907104492188, |
|
"epoch": 0.16850291639662995, |
|
"grad_norm": 0.17935605548712222, |
|
"kl": 0.001079559326171875, |
|
"learning_rate": 9.875187275265198e-07, |
|
"loss": 0.0255, |
|
"reward": 1.2364896833896637, |
|
"reward_std": 0.4289153516292572, |
|
"rewards/accuracy_reward": 0.7959183603525162, |
|
"rewards/improved_len_reward_dast": 0.44057128578424454, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 2525.2091064453125, |
|
"epoch": 0.1710952689565781, |
|
"grad_norm": 0.14682421707314297, |
|
"kl": 0.0012102127075195312, |
|
"learning_rate": 9.865450710344807e-07, |
|
"loss": 0.0344, |
|
"reward": 0.8753379732370377, |
|
"reward_std": 0.3238606099039316, |
|
"rewards/accuracy_reward": 0.5918367132544518, |
|
"rewards/improved_len_reward_dast": 0.2835012301802635, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2308.1478576660156, |
|
"epoch": 0.17368762151652625, |
|
"grad_norm": 0.17311806443951758, |
|
"kl": 0.001552581787109375, |
|
"learning_rate": 9.855354252214307e-07, |
|
"loss": 0.0564, |
|
"reward": 1.152388408780098, |
|
"reward_std": 0.4479888826608658, |
|
"rewards/accuracy_reward": 0.7653061002492905, |
|
"rewards/improved_len_reward_dast": 0.3870823085308075, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 1699.9540405273438, |
|
"epoch": 0.1762799740764744, |
|
"grad_norm": 0.18795647394996712, |
|
"kl": 0.0012683868408203125, |
|
"learning_rate": 9.844898733238311e-07, |
|
"loss": 0.0538, |
|
"reward": 1.4352277517318726, |
|
"reward_std": 0.30926575139164925, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.5678808689117432, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 1942.3876953125, |
|
"epoch": 0.17887232663642255, |
|
"grad_norm": 0.2210659776524768, |
|
"kl": 0.0016345977783203125, |
|
"learning_rate": 9.83408501538287e-07, |
|
"loss": -0.0183, |
|
"reward": 1.0560709834098816, |
|
"reward_std": 0.44945112615823746, |
|
"rewards/accuracy_reward": 0.7346938699483871, |
|
"rewards/improved_len_reward_dast": 0.32137710228562355, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 1671.9642639160156, |
|
"epoch": 0.18146467919637072, |
|
"grad_norm": 0.19750773670302219, |
|
"kl": 0.0015382766723632812, |
|
"learning_rate": 9.822913990144387e-07, |
|
"loss": 0.0167, |
|
"reward": 1.1308622658252716, |
|
"reward_std": 0.4337487518787384, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.2737194411456585, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2116.3571166992188, |
|
"epoch": 0.18405703175631885, |
|
"grad_norm": 0.1778004806410334, |
|
"kl": 0.00168609619140625, |
|
"learning_rate": 9.811386578476146e-07, |
|
"loss": 0.0029, |
|
"reward": 1.2179836481809616, |
|
"reward_std": 0.46442168205976486, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.457779623568058, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 1906.9795532226562, |
|
"epoch": 0.18664938431626701, |
|
"grad_norm": 0.1986625505084921, |
|
"kl": 0.001316070556640625, |
|
"learning_rate": 9.79950373071236e-07, |
|
"loss": 0.0285, |
|
"reward": 1.1908049881458282, |
|
"reward_std": 0.3781607896089554, |
|
"rewards/accuracy_reward": 0.7244897931814194, |
|
"rewards/improved_len_reward_dast": 0.4663151800632477, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1938.2652587890625, |
|
"epoch": 0.18924173687621518, |
|
"grad_norm": 0.178605084347928, |
|
"kl": 0.001659393310546875, |
|
"learning_rate": 9.787266426489845e-07, |
|
"loss": 0.0145, |
|
"reward": 1.233821153640747, |
|
"reward_std": 0.40631671994924545, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.46341295540332794, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2097.5152587890625, |
|
"epoch": 0.1918340894361633, |
|
"grad_norm": 0.21993776817198404, |
|
"kl": 0.0017414093017578125, |
|
"learning_rate": 9.77467567466725e-07, |
|
"loss": 0.0586, |
|
"reward": 1.0030385106801987, |
|
"reward_std": 0.48096026852726936, |
|
"rewards/accuracy_reward": 0.6989795863628387, |
|
"rewards/improved_len_reward_dast": 0.30405890196561813, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2267.7193298339844, |
|
"epoch": 0.19442644199611148, |
|
"grad_norm": 0.25966079935566605, |
|
"kl": 0.002155303955078125, |
|
"learning_rate": 9.761732513241882e-07, |
|
"loss": 0.1164, |
|
"reward": 1.1867494583129883, |
|
"reward_std": 0.36580438911914825, |
|
"rewards/accuracy_reward": 0.7346938699483871, |
|
"rewards/improved_len_reward_dast": 0.45205555111169815, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 1932.4285278320312, |
|
"epoch": 0.1970187945560596, |
|
"grad_norm": 0.18810468542751257, |
|
"kl": 0.0028076171875, |
|
"learning_rate": 9.748438009264142e-07, |
|
"loss": 0.0311, |
|
"reward": 1.302773892879486, |
|
"reward_std": 0.3699945732951164, |
|
"rewards/accuracy_reward": 0.8265306055545807, |
|
"rewards/improved_len_reward_dast": 0.4762432426214218, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 2192.2601928710938, |
|
"epoch": 0.19961114711600778, |
|
"grad_norm": 0.1818517530996337, |
|
"kl": 0.002178192138671875, |
|
"learning_rate": 9.734793258749538e-07, |
|
"loss": 0.0556, |
|
"reward": 1.2119455933570862, |
|
"reward_std": 0.33562129363417625, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.4517414830625057, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2217.4693298339844, |
|
"epoch": 0.20220349967595594, |
|
"grad_norm": 0.17001135134898285, |
|
"kl": 0.002323150634765625, |
|
"learning_rate": 9.720799386588358e-07, |
|
"loss": 0.0214, |
|
"reward": 1.0081346929073334, |
|
"reward_std": 0.5323201268911362, |
|
"rewards/accuracy_reward": 0.6938775479793549, |
|
"rewards/improved_len_reward_dast": 0.3142571374773979, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2039.5867309570312, |
|
"epoch": 0.20479585223590407, |
|
"grad_norm": 0.19848985839460778, |
|
"kl": 0.002605438232421875, |
|
"learning_rate": 9.706457546452898e-07, |
|
"loss": 0.0507, |
|
"reward": 1.1386294960975647, |
|
"reward_std": 0.3946889452636242, |
|
"rewards/accuracy_reward": 0.7448979541659355, |
|
"rewards/improved_len_reward_dast": 0.3937314935028553, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 2590.5305786132812, |
|
"epoch": 0.20738820479585224, |
|
"grad_norm": 0.15129066062202914, |
|
"kl": 0.002803802490234375, |
|
"learning_rate": 9.691768920702379e-07, |
|
"loss": -0.0267, |
|
"reward": 0.8391379117965698, |
|
"reward_std": 0.39438748359680176, |
|
"rewards/accuracy_reward": 0.5765306055545807, |
|
"rewards/improved_len_reward_dast": 0.26260728016495705, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 2176.096893310547, |
|
"epoch": 0.2099805573558004, |
|
"grad_norm": 0.18394525455650038, |
|
"kl": 0.00240325927734375, |
|
"learning_rate": 9.676734720285456e-07, |
|
"loss": 0.0667, |
|
"reward": 1.148956298828125, |
|
"reward_std": 0.34060123562812805, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.4040583297610283, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 2104.994842529297, |
|
"epoch": 0.21257290991574854, |
|
"grad_norm": 0.1783774193001553, |
|
"kl": 0.00263214111328125, |
|
"learning_rate": 9.661356184640394e-07, |
|
"loss": 0.0607, |
|
"reward": 1.300699919462204, |
|
"reward_std": 0.29261183738708496, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.5149856060743332, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 2017.9591674804688, |
|
"epoch": 0.2151652624756967, |
|
"grad_norm": 0.20548002392363018, |
|
"kl": 0.003589630126953125, |
|
"learning_rate": 9.64563458159288e-07, |
|
"loss": 0.0372, |
|
"reward": 1.2817473858594894, |
|
"reward_std": 0.42862868309020996, |
|
"rewards/accuracy_reward": 0.8265305906534195, |
|
"rewards/improved_len_reward_dast": 0.45521679520606995, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2365.132568359375, |
|
"epoch": 0.21775761503564484, |
|
"grad_norm": 0.2118006180262065, |
|
"kl": 0.003673553466796875, |
|
"learning_rate": 9.629571207251515e-07, |
|
"loss": 0.0474, |
|
"reward": 1.1858174800872803, |
|
"reward_std": 0.42872869968414307, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.4256134256720543, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 2227.8111572265625, |
|
"epoch": 0.220349967595593, |
|
"grad_norm": 0.1730257242071835, |
|
"kl": 0.0032958984375, |
|
"learning_rate": 9.613167385900944e-07, |
|
"loss": 0.0116, |
|
"reward": 0.9865487962961197, |
|
"reward_std": 0.30924591794610023, |
|
"rewards/accuracy_reward": 0.6887754946947098, |
|
"rewards/improved_len_reward_dast": 0.2977732727304101, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2069.8213806152344, |
|
"epoch": 0.22294232015554116, |
|
"grad_norm": 0.1997054811852766, |
|
"kl": 0.003353118896484375, |
|
"learning_rate": 9.59642446989269e-07, |
|
"loss": 0.0275, |
|
"reward": 1.2090528905391693, |
|
"reward_std": 0.4271962344646454, |
|
"rewards/accuracy_reward": 0.7806122303009033, |
|
"rewards/improved_len_reward_dast": 0.428440660238266, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 2234.255096435547, |
|
"epoch": 0.2255346727154893, |
|
"grad_norm": 0.1689278406473576, |
|
"kl": 0.0041046142578125, |
|
"learning_rate": 9.579343839533668e-07, |
|
"loss": 0.0395, |
|
"reward": 1.1342998147010803, |
|
"reward_std": 0.3173440955579281, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.3945038840174675, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2258.3009643554688, |
|
"epoch": 0.22812702527543746, |
|
"grad_norm": 0.19449538540190586, |
|
"kl": 0.004421234130859375, |
|
"learning_rate": 9.561926902972378e-07, |
|
"loss": 0.0785, |
|
"reward": 1.2548484802246094, |
|
"reward_std": 0.3709937259554863, |
|
"rewards/accuracy_reward": 0.7755101770162582, |
|
"rewards/improved_len_reward_dast": 0.47933831810951233, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1870.6989440917969, |
|
"epoch": 0.23071937783538563, |
|
"grad_norm": 0.1864398126735164, |
|
"kl": 0.0042266845703125, |
|
"learning_rate": 9.544175096082838e-07, |
|
"loss": 0.0646, |
|
"reward": 1.4300118386745453, |
|
"reward_std": 0.4286029487848282, |
|
"rewards/accuracy_reward": 0.8928571343421936, |
|
"rewards/improved_len_reward_dast": 0.5371547788381577, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 2082.653045654297, |
|
"epoch": 0.23331173039533376, |
|
"grad_norm": 0.17766778571294792, |
|
"kl": 0.00475311279296875, |
|
"learning_rate": 9.526089882346172e-07, |
|
"loss": 0.032, |
|
"reward": 1.1855316758155823, |
|
"reward_std": 0.36463288590312004, |
|
"rewards/accuracy_reward": 0.7551020085811615, |
|
"rewards/improved_len_reward_dast": 0.4304296597838402, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 2117.2244262695312, |
|
"epoch": 0.23590408295528192, |
|
"grad_norm": 0.19874233088672905, |
|
"kl": 0.003894805908203125, |
|
"learning_rate": 9.507672752730001e-07, |
|
"loss": 0.052, |
|
"reward": 1.0779342502355576, |
|
"reward_std": 0.45030639320611954, |
|
"rewards/accuracy_reward": 0.734693855047226, |
|
"rewards/improved_len_reward_dast": 0.3432403616607189, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2126.6173095703125, |
|
"epoch": 0.23849643551523006, |
|
"grad_norm": 0.20706633281686568, |
|
"kl": 0.004180908203125, |
|
"learning_rate": 9.4889252255655e-07, |
|
"loss": 0.0681, |
|
"reward": 1.1621150970458984, |
|
"reward_std": 0.2173718847334385, |
|
"rewards/accuracy_reward": 0.7295918315649033, |
|
"rewards/improved_len_reward_dast": 0.43252328783273697, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 2107.4692993164062, |
|
"epoch": 0.24108878807517822, |
|
"grad_norm": 0.18999527082233988, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 9.469848846422223e-07, |
|
"loss": 0.0305, |
|
"reward": 0.9012731686234474, |
|
"reward_std": 0.2958849798887968, |
|
"rewards/accuracy_reward": 0.6326530501246452, |
|
"rewards/improved_len_reward_dast": 0.2686200775206089, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 2329.5662841796875, |
|
"epoch": 0.2436811406351264, |
|
"grad_norm": 0.17793830796024995, |
|
"kl": 0.004726409912109375, |
|
"learning_rate": 9.450445187980699e-07, |
|
"loss": 0.0053, |
|
"reward": 1.0069625079631805, |
|
"reward_std": 0.4442039094865322, |
|
"rewards/accuracy_reward": 0.663265272974968, |
|
"rewards/improved_len_reward_dast": 0.3436972051858902, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 2371.1223754882812, |
|
"epoch": 0.24627349319507452, |
|
"grad_norm": 0.16551461901403783, |
|
"kl": 0.00560760498046875, |
|
"learning_rate": 9.430715849902774e-07, |
|
"loss": 0.0161, |
|
"reward": 1.1833973824977875, |
|
"reward_std": 0.3829594776034355, |
|
"rewards/accuracy_reward": 0.7551020309329033, |
|
"rewards/improved_len_reward_dast": 0.4282953441143036, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 1950.9897766113281, |
|
"epoch": 0.24886584575502269, |
|
"grad_norm": 0.22225719247681372, |
|
"kl": 0.004608154296875, |
|
"learning_rate": 9.410662458699723e-07, |
|
"loss": 0.0456, |
|
"reward": 1.138383835554123, |
|
"reward_std": 0.32722293585538864, |
|
"rewards/accuracy_reward": 0.7142857015132904, |
|
"rewards/improved_len_reward_dast": 0.4240981712937355, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1459.1683349609375, |
|
"epoch": 0.25145819831497085, |
|
"grad_norm": 0.20670520181853694, |
|
"kl": 0.00476837158203125, |
|
"learning_rate": 9.390286667598169e-07, |
|
"loss": 0.0546, |
|
"reward": 1.3123253285884857, |
|
"reward_std": 0.31760613806545734, |
|
"rewards/accuracy_reward": 0.846938744187355, |
|
"rewards/improved_len_reward_dast": 0.4653865396976471, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1836.9029846191406, |
|
"epoch": 0.254050550874919, |
|
"grad_norm": 0.20386220038181252, |
|
"kl": 0.00446319580078125, |
|
"learning_rate": 9.369590156403784e-07, |
|
"loss": 0.0339, |
|
"reward": 1.3093420267105103, |
|
"reward_std": 0.42256173491477966, |
|
"rewards/accuracy_reward": 0.8163265138864517, |
|
"rewards/improved_len_reward_dast": 0.49301546812057495, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1921.7550354003906, |
|
"epoch": 0.2566429034348671, |
|
"grad_norm": 0.22385072499443348, |
|
"kl": 0.00586700439453125, |
|
"learning_rate": 9.348574631362808e-07, |
|
"loss": 0.0254, |
|
"reward": 1.369395136833191, |
|
"reward_std": 0.292521633207798, |
|
"rewards/accuracy_reward": 0.8367346823215485, |
|
"rewards/improved_len_reward_dast": 0.5326604098081589, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 1589.2550659179688, |
|
"epoch": 0.2592352559948153, |
|
"grad_norm": 0.23062182502361955, |
|
"kl": 0.003963470458984375, |
|
"learning_rate": 9.327241825021379e-07, |
|
"loss": 0.0939, |
|
"reward": 1.398920476436615, |
|
"reward_std": 0.34097858518362045, |
|
"rewards/accuracy_reward": 0.8979591578245163, |
|
"rewards/improved_len_reward_dast": 0.5009612441062927, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 1968.3979187011719, |
|
"epoch": 0.26182760855476345, |
|
"grad_norm": 0.19172453408443837, |
|
"kl": 0.0052337646484375, |
|
"learning_rate": 9.3055934960827e-07, |
|
"loss": 0.033, |
|
"reward": 1.2349633574485779, |
|
"reward_std": 0.4557712897658348, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.46455518156290054, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2024.6580810546875, |
|
"epoch": 0.2644199611147116, |
|
"grad_norm": 0.18835419471758258, |
|
"kl": 0.00595855712890625, |
|
"learning_rate": 9.283631429262053e-07, |
|
"loss": -0.0018, |
|
"reward": 1.237942174077034, |
|
"reward_std": 0.4386955201625824, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.4522278979420662, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 2042.0101623535156, |
|
"epoch": 0.2670123136746598, |
|
"grad_norm": 0.16797444756904736, |
|
"kl": 0.00687408447265625, |
|
"learning_rate": 9.261357435139665e-07, |
|
"loss": 0.0127, |
|
"reward": 1.147979348897934, |
|
"reward_std": 0.39860222302377224, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.3877752497792244, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 1771.6785278320312, |
|
"epoch": 0.2696046662346079, |
|
"grad_norm": 0.19397130084636785, |
|
"kl": 0.00556182861328125, |
|
"learning_rate": 9.238773350011437e-07, |
|
"loss": 0.0329, |
|
"reward": 1.3575038313865662, |
|
"reward_std": 0.28452699072659016, |
|
"rewards/accuracy_reward": 0.8418367356061935, |
|
"rewards/improved_len_reward_dast": 0.5156671032309532, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 1984.2295532226562, |
|
"epoch": 0.27219701879455604, |
|
"grad_norm": 0.20491481745891912, |
|
"kl": 0.00533294677734375, |
|
"learning_rate": 9.215881035737557e-07, |
|
"loss": 0.0756, |
|
"reward": 1.3917469382286072, |
|
"reward_std": 0.3919885456562042, |
|
"rewards/accuracy_reward": 0.8673469126224518, |
|
"rewards/improved_len_reward_dast": 0.5244000777602196, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2123.3570861816406, |
|
"epoch": 0.2747893713545042, |
|
"grad_norm": 0.19107859298960242, |
|
"kl": 0.00609588623046875, |
|
"learning_rate": 9.192682379589017e-07, |
|
"loss": 0.0343, |
|
"reward": 1.3419382572174072, |
|
"reward_std": 0.550883948802948, |
|
"rewards/accuracy_reward": 0.8163265287876129, |
|
"rewards/improved_len_reward_dast": 0.5256116688251495, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2321.183563232422, |
|
"epoch": 0.27738172391445237, |
|
"grad_norm": 0.17417279176148165, |
|
"kl": 0.00618743896484375, |
|
"learning_rate": 9.169179294092006e-07, |
|
"loss": 0.037, |
|
"reward": 1.2553168833255768, |
|
"reward_std": 0.3132058009505272, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.49001070857048035, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 1755.6121826171875, |
|
"epoch": 0.27997407647440054, |
|
"grad_norm": 0.1910812285243796, |
|
"kl": 0.0055389404296875, |
|
"learning_rate": 9.145373716870257e-07, |
|
"loss": 0.0074, |
|
"reward": 1.1911440938711166, |
|
"reward_std": 0.47732261940836906, |
|
"rewards/accuracy_reward": 0.8265305906534195, |
|
"rewards/improved_len_reward_dast": 0.36461350694298744, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2498.53564453125, |
|
"epoch": 0.2825664290343487, |
|
"grad_norm": 0.1847398357059974, |
|
"kl": 0.0076904296875, |
|
"learning_rate": 9.121267610485294e-07, |
|
"loss": 0.0136, |
|
"reward": 1.0379046350717545, |
|
"reward_std": 0.5191724747419357, |
|
"rewards/accuracy_reward": 0.6734693795442581, |
|
"rewards/improved_len_reward_dast": 0.36443524062633514, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 1881.5408020019531, |
|
"epoch": 0.2851587815942968, |
|
"grad_norm": 0.1895141382280174, |
|
"kl": 0.0063629150390625, |
|
"learning_rate": 9.096862962274642e-07, |
|
"loss": -0.0114, |
|
"reward": 1.2222436666488647, |
|
"reward_std": 0.2921589985489845, |
|
"rewards/accuracy_reward": 0.760204054415226, |
|
"rewards/improved_len_reward_dast": 0.4620395749807358, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 2229.341827392578, |
|
"epoch": 0.28775113415424497, |
|
"grad_norm": 0.16533064618080134, |
|
"kl": 0.00737762451171875, |
|
"learning_rate": 9.072161784187988e-07, |
|
"loss": 0.029, |
|
"reward": 1.213012382388115, |
|
"reward_std": 0.427090298384428, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.41709401085972786, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 1740.8673400878906, |
|
"epoch": 0.29034348671419313, |
|
"grad_norm": 0.17704874550004857, |
|
"kl": 0.00606536865234375, |
|
"learning_rate": 9.047166112621312e-07, |
|
"loss": 0.0232, |
|
"reward": 1.3144700229167938, |
|
"reward_std": 0.3366679251194, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.4981435164809227, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 2048.397918701172, |
|
"epoch": 0.2929358392741413, |
|
"grad_norm": 0.19568646749424262, |
|
"kl": 0.00690460205078125, |
|
"learning_rate": 9.021878008249001e-07, |
|
"loss": 0.0206, |
|
"reward": 1.1744825094938278, |
|
"reward_std": 0.479649193584919, |
|
"rewards/accuracy_reward": 0.7806122303009033, |
|
"rewards/improved_len_reward_dast": 0.3938702493906021, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1883.0255126953125, |
|
"epoch": 0.29552819183408946, |
|
"grad_norm": 0.201863471118327, |
|
"kl": 0.007293701171875, |
|
"learning_rate": 8.996299555853973e-07, |
|
"loss": 0.0263, |
|
"reward": 1.3593637347221375, |
|
"reward_std": 0.3963543549180031, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.5175270512700081, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 1779.4489135742188, |
|
"epoch": 0.29812054439403757, |
|
"grad_norm": 0.21073286141952957, |
|
"kl": 0.00705718994140625, |
|
"learning_rate": 8.970432864155798e-07, |
|
"loss": 0.059, |
|
"reward": 1.284899353981018, |
|
"reward_std": 0.3950739651918411, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.49408305436372757, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 1918.2244873046875, |
|
"epoch": 0.30071289695398573, |
|
"grad_norm": 0.19227538961602422, |
|
"kl": 0.00742340087890625, |
|
"learning_rate": 8.944280065636851e-07, |
|
"loss": 0.0454, |
|
"reward": 1.2475728243589401, |
|
"reward_std": 0.32171259075403214, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.4618585482239723, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 1858.4795532226562, |
|
"epoch": 0.3033052495139339, |
|
"grad_norm": 0.19238271005304078, |
|
"kl": 0.00749969482421875, |
|
"learning_rate": 8.917843316366515e-07, |
|
"loss": 0.0387, |
|
"reward": 1.364868402481079, |
|
"reward_std": 0.2818027026951313, |
|
"rewards/accuracy_reward": 0.8316326439380646, |
|
"rewards/improved_len_reward_dast": 0.533235713839531, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 1993.6224060058594, |
|
"epoch": 0.30589760207388206, |
|
"grad_norm": 0.231864346111992, |
|
"kl": 0.00769805908203125, |
|
"learning_rate": 8.891124795823426e-07, |
|
"loss": -0.0075, |
|
"reward": 1.1190623342990875, |
|
"reward_std": 0.2991497367620468, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.3282460141927004, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1985.5509643554688, |
|
"epoch": 0.3084899546338302, |
|
"grad_norm": 0.17623896225871394, |
|
"kl": 0.00771331787109375, |
|
"learning_rate": 8.864126706715796e-07, |
|
"loss": 0.0186, |
|
"reward": 1.2160087823867798, |
|
"reward_std": 0.35445018485188484, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.4711107425391674, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 2125.1376953125, |
|
"epoch": 0.31108230719377833, |
|
"grad_norm": 0.2263640313290784, |
|
"kl": 0.0087432861328125, |
|
"learning_rate": 8.83685127479982e-07, |
|
"loss": 0.0941, |
|
"reward": 1.281501442193985, |
|
"reward_std": 0.38218285515904427, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.5110933035612106, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 1814.5611877441406, |
|
"epoch": 0.3136746597537265, |
|
"grad_norm": 0.19715675281839773, |
|
"kl": 0.007568359375, |
|
"learning_rate": 8.809300748696173e-07, |
|
"loss": 0.0386, |
|
"reward": 1.1133249253034592, |
|
"reward_std": 0.3796735033392906, |
|
"rewards/accuracy_reward": 0.7295918315649033, |
|
"rewards/improved_len_reward_dast": 0.38373304158449173, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2427.4489135742188, |
|
"epoch": 0.31626701231367466, |
|
"grad_norm": 0.16760355775672944, |
|
"kl": 0.00905609130859375, |
|
"learning_rate": 8.781477399704652e-07, |
|
"loss": 0.0048, |
|
"reward": 1.0130163729190826, |
|
"reward_std": 0.4051677845418453, |
|
"rewards/accuracy_reward": 0.6632652878761292, |
|
"rewards/improved_len_reward_dast": 0.349751066416502, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2251.3570861816406, |
|
"epoch": 0.3188593648736228, |
|
"grad_norm": 0.1882544168870131, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 8.753383521616902e-07, |
|
"loss": 0.0008, |
|
"reward": 1.1944599151611328, |
|
"reward_std": 0.4080551564693451, |
|
"rewards/accuracy_reward": 0.7499999850988388, |
|
"rewards/improved_len_reward_dast": 0.4444599226117134, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1852.142822265625, |
|
"epoch": 0.321451717433571, |
|
"grad_norm": 0.22567456549295617, |
|
"kl": 0.007122039794921875, |
|
"learning_rate": 8.72502143052733e-07, |
|
"loss": 0.0421, |
|
"reward": 1.0371171534061432, |
|
"reward_std": 0.4070936441421509, |
|
"rewards/accuracy_reward": 0.6887754946947098, |
|
"rewards/improved_len_reward_dast": 0.34834159165620804, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 1902.4897766113281, |
|
"epoch": 0.32404406999351915, |
|
"grad_norm": 0.18976500768952323, |
|
"kl": 0.00728607177734375, |
|
"learning_rate": 8.696393464642158e-07, |
|
"loss": -0.0168, |
|
"reward": 1.379349261522293, |
|
"reward_std": 0.34975893795490265, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.5324105769395828, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1687.3979187011719, |
|
"epoch": 0.32663642255346725, |
|
"grad_norm": 0.1842833719422884, |
|
"kl": 0.00609588623046875, |
|
"learning_rate": 8.667501984086655e-07, |
|
"loss": 0.0248, |
|
"reward": 1.3401367366313934, |
|
"reward_std": 0.26001402735710144, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.5544224381446838, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1719.23974609375, |
|
"epoch": 0.3292287751134154, |
|
"grad_norm": 0.2122526031093734, |
|
"kl": 0.00665283203125, |
|
"learning_rate": 8.638349370710573e-07, |
|
"loss": 0.0493, |
|
"reward": 1.2587095499038696, |
|
"reward_std": 0.30533889308571815, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.4423830099403858, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 1702.78564453125, |
|
"epoch": 0.3318211276733636, |
|
"grad_norm": 0.18811783070011717, |
|
"kl": 0.00623321533203125, |
|
"learning_rate": 8.608938027891775e-07, |
|
"loss": 0.0049, |
|
"reward": 1.3044427931308746, |
|
"reward_std": 0.47574885934591293, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.49832039326429367, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 1589.6376953125, |
|
"epoch": 0.33441348023331174, |
|
"grad_norm": 0.2122723729405287, |
|
"kl": 0.007274627685546875, |
|
"learning_rate": 8.579270380338107e-07, |
|
"loss": 0.0378, |
|
"reward": 1.3573221862316132, |
|
"reward_std": 0.40166376531124115, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.510383352637291, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2209.2244873046875, |
|
"epoch": 0.3370058327932599, |
|
"grad_norm": 0.18766107651382932, |
|
"kl": 0.0082550048828125, |
|
"learning_rate": 8.549348873887496e-07, |
|
"loss": -0.035, |
|
"reward": 0.9989715814590454, |
|
"reward_std": 0.4630734659731388, |
|
"rewards/accuracy_reward": 0.6734693646430969, |
|
"rewards/improved_len_reward_dast": 0.32550226897001266, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 1750.2499694824219, |
|
"epoch": 0.339598185353208, |
|
"grad_norm": 0.26668844455154506, |
|
"kl": 0.0062713623046875, |
|
"learning_rate": 8.519175975306312e-07, |
|
"loss": 0.0733, |
|
"reward": 1.0193718448281288, |
|
"reward_std": 0.49021392315626144, |
|
"rewards/accuracy_reward": 0.6989795863628387, |
|
"rewards/improved_len_reward_dast": 0.3203922025859356, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 1834.892822265625, |
|
"epoch": 0.3421905379131562, |
|
"grad_norm": 0.17123158557193757, |
|
"kl": 0.006275177001953125, |
|
"learning_rate": 8.48875417208601e-07, |
|
"loss": 0.0191, |
|
"reward": 1.2724904865026474, |
|
"reward_std": 0.36864253878593445, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.5020823329687119, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 1844.9081115722656, |
|
"epoch": 0.34478289047310434, |
|
"grad_norm": 0.1744110793812119, |
|
"kl": 0.00693511962890625, |
|
"learning_rate": 8.458085972238048e-07, |
|
"loss": 0.0332, |
|
"reward": 1.0728662610054016, |
|
"reward_std": 0.4644254148006439, |
|
"rewards/accuracy_reward": 0.7499999850988388, |
|
"rewards/improved_len_reward_dast": 0.3228662498295307, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1910.1427917480469, |
|
"epoch": 0.3473752430330525, |
|
"grad_norm": 0.22282630764089068, |
|
"kl": 0.0084686279296875, |
|
"learning_rate": 8.427173904087138e-07, |
|
"loss": 0.0291, |
|
"reward": 1.1172972619533539, |
|
"reward_std": 0.3814988359808922, |
|
"rewards/accuracy_reward": 0.7551020085811615, |
|
"rewards/improved_len_reward_dast": 0.36219523288309574, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2461.3775329589844, |
|
"epoch": 0.34996759559300067, |
|
"grad_norm": 0.1595488734110434, |
|
"kl": 0.0104522705078125, |
|
"learning_rate": 8.396020516062794e-07, |
|
"loss": -0.0068, |
|
"reward": 0.9715078249573708, |
|
"reward_std": 0.3740999586880207, |
|
"rewards/accuracy_reward": 0.6173469200730324, |
|
"rewards/improved_len_reward_dast": 0.3541608899831772, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1467.096908569336, |
|
"epoch": 0.3525599481529488, |
|
"grad_norm": 0.17905275908990426, |
|
"kl": 0.005458831787109375, |
|
"learning_rate": 8.364628376489242e-07, |
|
"loss": 0.0333, |
|
"reward": 1.558873325586319, |
|
"reward_std": 0.29448162391781807, |
|
"rewards/accuracy_reward": 0.928571417927742, |
|
"rewards/improved_len_reward_dast": 0.6303019374608994, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1310.5, |
|
"epoch": 0.35515230071289694, |
|
"grad_norm": 0.20951329036509847, |
|
"kl": 0.0060577392578125, |
|
"learning_rate": 8.333000073373685e-07, |
|
"loss": -0.0166, |
|
"reward": 1.2859368920326233, |
|
"reward_std": 0.3338315784931183, |
|
"rewards/accuracy_reward": 0.8061224520206451, |
|
"rewards/improved_len_reward_dast": 0.47981445118784904, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1815.6122436523438, |
|
"epoch": 0.3577446532728451, |
|
"grad_norm": 0.19604752185803775, |
|
"kl": 0.0070953369140625, |
|
"learning_rate": 8.301138214192945e-07, |
|
"loss": 0.0433, |
|
"reward": 1.2342120856046677, |
|
"reward_std": 0.4501468688249588, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.4331916607916355, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 1862.0764770507812, |
|
"epoch": 0.36033700583279327, |
|
"grad_norm": 0.18709921475186367, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 8.269045425678497e-07, |
|
"loss": -0.011, |
|
"reward": 1.2167351096868515, |
|
"reward_std": 0.3770736940205097, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.45653103291988373, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 1736.1376953125, |
|
"epoch": 0.36292935839274143, |
|
"grad_norm": 0.19354018571685683, |
|
"kl": 0.0071258544921875, |
|
"learning_rate": 8.236724353599918e-07, |
|
"loss": 0.041, |
|
"reward": 1.496632605791092, |
|
"reward_std": 0.3335278294980526, |
|
"rewards/accuracy_reward": 0.8979591578245163, |
|
"rewards/improved_len_reward_dast": 0.5986734926700592, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 1628.4183654785156, |
|
"epoch": 0.36552171095268954, |
|
"grad_norm": 0.16803171468726585, |
|
"kl": 0.00705718994140625, |
|
"learning_rate": 8.204177662546763e-07, |
|
"loss": -0.0198, |
|
"reward": 1.2802585661411285, |
|
"reward_std": 0.3480174820870161, |
|
"rewards/accuracy_reward": 0.8163265138864517, |
|
"rewards/improved_len_reward_dast": 0.46393200755119324, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 1563.2244567871094, |
|
"epoch": 0.3681140635126377, |
|
"grad_norm": 0.21830948983629073, |
|
"kl": 0.006256103515625, |
|
"learning_rate": 8.171408035708906e-07, |
|
"loss": 0.0147, |
|
"reward": 1.477361023426056, |
|
"reward_std": 0.36876992136240005, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.6151161342859268, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 1426.9744567871094, |
|
"epoch": 0.37070641607258586, |
|
"grad_norm": 0.1829469047156503, |
|
"kl": 0.005870819091796875, |
|
"learning_rate": 8.138418174655323e-07, |
|
"loss": -0.0128, |
|
"reward": 1.475436508655548, |
|
"reward_std": 0.28024090081453323, |
|
"rewards/accuracy_reward": 0.8877550959587097, |
|
"rewards/improved_len_reward_dast": 0.5876814350485802, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2269.73974609375, |
|
"epoch": 0.37329876863253403, |
|
"grad_norm": 0.15370768982629232, |
|
"kl": 0.00823974609375, |
|
"learning_rate": 8.105210799111366e-07, |
|
"loss": 0.029, |
|
"reward": 1.0333527326583862, |
|
"reward_std": 0.4238397367298603, |
|
"rewards/accuracy_reward": 0.6632652878761292, |
|
"rewards/improved_len_reward_dast": 0.37008739449083805, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 1661.2142333984375, |
|
"epoch": 0.3758911211924822, |
|
"grad_norm": 0.1756144937263373, |
|
"kl": 0.006439208984375, |
|
"learning_rate": 8.071788646734564e-07, |
|
"loss": 0.0278, |
|
"reward": 1.297868698835373, |
|
"reward_std": 0.30791742727160454, |
|
"rewards/accuracy_reward": 0.8163265138864517, |
|
"rewards/improved_len_reward_dast": 0.4815421551465988, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 1629.2754516601562, |
|
"epoch": 0.37848347375243035, |
|
"grad_norm": 0.19753853796416515, |
|
"kl": 0.006805419921875, |
|
"learning_rate": 8.038154472888909e-07, |
|
"loss": -0.0047, |
|
"reward": 1.2643596529960632, |
|
"reward_std": 0.403556901961565, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.45823724940419197, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 1698.1785278320312, |
|
"epoch": 0.38107582631237846, |
|
"grad_norm": 0.18090958864036752, |
|
"kl": 0.00759124755859375, |
|
"learning_rate": 8.004311050417711e-07, |
|
"loss": -0.0063, |
|
"reward": 1.2380123734474182, |
|
"reward_std": 0.39292842149734497, |
|
"rewards/accuracy_reward": 0.7806122153997421, |
|
"rewards/improved_len_reward_dast": 0.4574001543223858, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1603.7703704833984, |
|
"epoch": 0.3836681788723266, |
|
"grad_norm": 0.1689548990240542, |
|
"kl": 0.00655364990234375, |
|
"learning_rate": 7.970261169414999e-07, |
|
"loss": 0.0034, |
|
"reward": 1.2632354497909546, |
|
"reward_std": 0.42876998893916607, |
|
"rewards/accuracy_reward": 0.8010203838348389, |
|
"rewards/improved_len_reward_dast": 0.46221502870321274, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2111.928497314453, |
|
"epoch": 0.3862605314322748, |
|
"grad_norm": 0.23403462014206552, |
|
"kl": 0.00902557373046875, |
|
"learning_rate": 7.936007636995497e-07, |
|
"loss": 0.0581, |
|
"reward": 1.1535758823156357, |
|
"reward_std": 0.33541079610586166, |
|
"rewards/accuracy_reward": 0.7091836556792259, |
|
"rewards/improved_len_reward_dast": 0.44439224898815155, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 1584.5560760498047, |
|
"epoch": 0.38885288399222295, |
|
"grad_norm": 0.19966714442908384, |
|
"kl": 0.00608062744140625, |
|
"learning_rate": 7.901553277063213e-07, |
|
"loss": -0.0136, |
|
"reward": 1.0925945341587067, |
|
"reward_std": 0.4660287909209728, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.3323905020952225, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 1963.030502319336, |
|
"epoch": 0.3914452365521711, |
|
"grad_norm": 0.17996728024183786, |
|
"kl": 0.0086822509765625, |
|
"learning_rate": 7.866900930078618e-07, |
|
"loss": 0.0058, |
|
"reward": 1.245696559548378, |
|
"reward_std": 0.4446266293525696, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.4854924902319908, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 1893.0254821777344, |
|
"epoch": 0.3940375891121192, |
|
"grad_norm": 0.16735022993158205, |
|
"kl": 0.007110595703125, |
|
"learning_rate": 7.832053452824489e-07, |
|
"loss": 0.0104, |
|
"reward": 1.2418105602264404, |
|
"reward_std": 0.4090575650334358, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.4714023545384407, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 1724.3111572265625, |
|
"epoch": 0.3966299416720674, |
|
"grad_norm": 0.1864010620729168, |
|
"kl": 0.00872802734375, |
|
"learning_rate": 7.797013718170384e-07, |
|
"loss": 0.0296, |
|
"reward": 1.1897482573986053, |
|
"reward_std": 0.3867075741291046, |
|
"rewards/accuracy_reward": 0.7755101919174194, |
|
"rewards/improved_len_reward_dast": 0.4142380841076374, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1520.3673553466797, |
|
"epoch": 0.39922229423201555, |
|
"grad_norm": 0.19558753420229233, |
|
"kl": 0.006317138671875, |
|
"learning_rate": 7.761784614835801e-07, |
|
"loss": -0.0009, |
|
"reward": 1.1826948821544647, |
|
"reward_std": 0.44549785554409027, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.3969806134700775, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 1902.83154296875, |
|
"epoch": 0.4018146467919637, |
|
"grad_norm": 0.1628442801355898, |
|
"kl": 0.007907867431640625, |
|
"learning_rate": 7.726369047152029e-07, |
|
"loss": 0.0111, |
|
"reward": 1.1829434633255005, |
|
"reward_std": 0.4352233223617077, |
|
"rewards/accuracy_reward": 0.7346938699483871, |
|
"rewards/improved_len_reward_dast": 0.44824954867362976, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 1687.5867004394531, |
|
"epoch": 0.4044069993519119, |
|
"grad_norm": 0.15254799874290897, |
|
"kl": 0.0055694580078125, |
|
"learning_rate": 7.690769934822712e-07, |
|
"loss": 0.0209, |
|
"reward": 1.3427188694477081, |
|
"reward_std": 0.39824075251817703, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.5212903171777725, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1699.2857055664062, |
|
"epoch": 0.40699935191186, |
|
"grad_norm": 0.17162045711276386, |
|
"kl": 0.00756072998046875, |
|
"learning_rate": 7.654990212683142e-07, |
|
"loss": 0.0029, |
|
"reward": 1.3672717213630676, |
|
"reward_std": 0.34800875186920166, |
|
"rewards/accuracy_reward": 0.8520407974720001, |
|
"rewards/improved_len_reward_dast": 0.5152308940887451, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 1642.4897766113281, |
|
"epoch": 0.40959170447180815, |
|
"grad_norm": 0.17781118941038052, |
|
"kl": 0.0069427490234375, |
|
"learning_rate": 7.619032830458307e-07, |
|
"loss": 0.0238, |
|
"reward": 1.36138716340065, |
|
"reward_std": 0.42799485474824905, |
|
"rewards/accuracy_reward": 0.8520407974720001, |
|
"rewards/improved_len_reward_dast": 0.5093463957309723, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 2058.10205078125, |
|
"epoch": 0.4121840570317563, |
|
"grad_norm": 0.21486100887413462, |
|
"kl": 0.00844573974609375, |
|
"learning_rate": 7.582900752519723e-07, |
|
"loss": 0.052, |
|
"reward": 1.2367046475410461, |
|
"reward_std": 0.4686100408434868, |
|
"rewards/accuracy_reward": 0.7857142835855484, |
|
"rewards/improved_len_reward_dast": 0.45099035650491714, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 2116.7601928710938, |
|
"epoch": 0.4147764095917045, |
|
"grad_norm": 0.21872883985010524, |
|
"kl": 0.00928497314453125, |
|
"learning_rate": 7.546596957641031e-07, |
|
"loss": 0.0469, |
|
"reward": 1.1451009958982468, |
|
"reward_std": 0.2814931422472, |
|
"rewards/accuracy_reward": 0.7244897782802582, |
|
"rewards/improved_len_reward_dast": 0.4206111915409565, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 2057.1172790527344, |
|
"epoch": 0.41736876215165264, |
|
"grad_norm": 0.223277485058984, |
|
"kl": 0.0099639892578125, |
|
"learning_rate": 7.510124438752432e-07, |
|
"loss": 0.0282, |
|
"reward": 1.2358856201171875, |
|
"reward_std": 0.42381204664707184, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.4501713886857033, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1648.7907409667969, |
|
"epoch": 0.4199611147116008, |
|
"grad_norm": 0.19361427922643096, |
|
"kl": 0.007965087890625, |
|
"learning_rate": 7.473486202693949e-07, |
|
"loss": 0.0283, |
|
"reward": 1.5626276433467865, |
|
"reward_std": 0.33783891052007675, |
|
"rewards/accuracy_reward": 0.9081632643938065, |
|
"rewards/improved_len_reward_dast": 0.6544643938541412, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 1720.7805938720703, |
|
"epoch": 0.4225534672715489, |
|
"grad_norm": 0.22042630118078563, |
|
"kl": 0.008636474609375, |
|
"learning_rate": 7.43668526996753e-07, |
|
"loss": 0.0517, |
|
"reward": 1.203346148133278, |
|
"reward_std": 0.48596539348363876, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.43293796479701996, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1918.5816345214844, |
|
"epoch": 0.4251458198314971, |
|
"grad_norm": 0.20825217508460148, |
|
"kl": 0.0105438232421875, |
|
"learning_rate": 7.399724674488046e-07, |
|
"loss": 0.0313, |
|
"reward": 1.2619640827178955, |
|
"reward_std": 0.3394176550209522, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.49665799736976624, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 1879.0867004394531, |
|
"epoch": 0.42773817239144524, |
|
"grad_norm": 0.20859456410748778, |
|
"kl": 0.00949859619140625, |
|
"learning_rate": 7.36260746333316e-07, |
|
"loss": 0.1032, |
|
"reward": 1.250516802072525, |
|
"reward_std": 0.21495914831757545, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.48521073907613754, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1788.2040405273438, |
|
"epoch": 0.4303305249513934, |
|
"grad_norm": 0.19365279193672524, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 7.325336696492128e-07, |
|
"loss": 0.031, |
|
"reward": 1.3934488892555237, |
|
"reward_std": 0.3679058402776718, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.5261020287871361, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 2040.7346801757812, |
|
"epoch": 0.43292287751134156, |
|
"grad_norm": 0.1746728685861396, |
|
"kl": 0.010894775390625, |
|
"learning_rate": 7.287915446613531e-07, |
|
"loss": 0.0021, |
|
"reward": 1.270061433315277, |
|
"reward_std": 0.3740099295973778, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.46393903344869614, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 2118.234649658203, |
|
"epoch": 0.43551523007128967, |
|
"grad_norm": 0.20129074148639173, |
|
"kl": 0.013275146484375, |
|
"learning_rate": 7.250346798751953e-07, |
|
"loss": 0.006, |
|
"reward": 0.9839373528957367, |
|
"reward_std": 0.581517793238163, |
|
"rewards/accuracy_reward": 0.6785714030265808, |
|
"rewards/improved_len_reward_dast": 0.3053659498691559, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 1795.9540252685547, |
|
"epoch": 0.43810758263123784, |
|
"grad_norm": 0.1813953032982878, |
|
"kl": 0.009395599365234375, |
|
"learning_rate": 7.212633850113662e-07, |
|
"loss": 0.0235, |
|
"reward": 1.178409919142723, |
|
"reward_std": 0.4242382049560547, |
|
"rewards/accuracy_reward": 0.734693855047226, |
|
"rewards/improved_len_reward_dast": 0.44371599704027176, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1421.1734619140625, |
|
"epoch": 0.440699935191186, |
|
"grad_norm": 0.18794137958282095, |
|
"kl": 0.008941650390625, |
|
"learning_rate": 7.174779709801253e-07, |
|
"loss": 0.0159, |
|
"reward": 1.4234746396541595, |
|
"reward_std": 0.32885606586933136, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.5612297654151917, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1736.6632690429688, |
|
"epoch": 0.44329228775113416, |
|
"grad_norm": 0.22796049151575712, |
|
"kl": 0.009891510009765625, |
|
"learning_rate": 7.136787498557344e-07, |
|
"loss": 0.0088, |
|
"reward": 1.3514071702957153, |
|
"reward_std": 0.40995020419359207, |
|
"rewards/accuracy_reward": 0.846938744187355, |
|
"rewards/improved_len_reward_dast": 0.5044683739542961, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1768.7193603515625, |
|
"epoch": 0.4458846403110823, |
|
"grad_norm": 0.25032479837006205, |
|
"kl": 0.010284423828125, |
|
"learning_rate": 7.098660348507293e-07, |
|
"loss": 0.0732, |
|
"reward": 1.269765853881836, |
|
"reward_std": 0.46360351890325546, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.4993576854467392, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 1956.9999694824219, |
|
"epoch": 0.44847699287103043, |
|
"grad_norm": 0.17507117871432235, |
|
"kl": 0.0093231201171875, |
|
"learning_rate": 7.060401402900977e-07, |
|
"loss": 0.0185, |
|
"reward": 1.1613440364599228, |
|
"reward_std": 0.5052430480718613, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.42154809460043907, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 1834.2601623535156, |
|
"epoch": 0.4510693454309786, |
|
"grad_norm": 0.19217203672529928, |
|
"kl": 0.01007843017578125, |
|
"learning_rate": 7.022013815853672e-07, |
|
"loss": 0.0209, |
|
"reward": 1.0959883034229279, |
|
"reward_std": 0.47629018872976303, |
|
"rewards/accuracy_reward": 0.7295918166637421, |
|
"rewards/improved_len_reward_dast": 0.3663964793086052, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1817.4489440917969, |
|
"epoch": 0.45366169799092676, |
|
"grad_norm": 0.19322905501288215, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 6.983500752086006e-07, |
|
"loss": 0.0448, |
|
"reward": 1.2833284437656403, |
|
"reward_std": 0.43457718193531036, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.4874100536108017, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1651.7244873046875, |
|
"epoch": 0.4562540505508749, |
|
"grad_norm": 0.19443121591302054, |
|
"kl": 0.00969696044921875, |
|
"learning_rate": 6.94486538666307e-07, |
|
"loss": 0.0327, |
|
"reward": 1.254166454076767, |
|
"reward_std": 0.4054510071873665, |
|
"rewards/accuracy_reward": 0.7806122452020645, |
|
"rewards/improved_len_reward_dast": 0.47355421632528305, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1690.4234313964844, |
|
"epoch": 0.4588464031108231, |
|
"grad_norm": 0.2099852909442493, |
|
"kl": 0.0092010498046875, |
|
"learning_rate": 6.906110904732656e-07, |
|
"loss": -0.0115, |
|
"reward": 1.3241359293460846, |
|
"reward_std": 0.4749620705842972, |
|
"rewards/accuracy_reward": 0.8163265138864517, |
|
"rewards/improved_len_reward_dast": 0.5078093633055687, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 2150.1529541015625, |
|
"epoch": 0.46143875567077125, |
|
"grad_norm": 0.16262254100217993, |
|
"kl": 0.01073455810546875, |
|
"learning_rate": 6.867240501262666e-07, |
|
"loss": 0.0219, |
|
"reward": 1.3224327564239502, |
|
"reward_std": 0.31201132386922836, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.5622286796569824, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 1616.73974609375, |
|
"epoch": 0.46403110823071936, |
|
"grad_norm": 0.2054857790671321, |
|
"kl": 0.010406494140625, |
|
"learning_rate": 6.828257380777723e-07, |
|
"loss": -0.0028, |
|
"reward": 1.2023987025022507, |
|
"reward_std": 0.38464218378067017, |
|
"rewards/accuracy_reward": 0.8214285671710968, |
|
"rewards/improved_len_reward_dast": 0.38097016140818596, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1939.9744567871094, |
|
"epoch": 0.4666234607906675, |
|
"grad_norm": 0.18969129476831767, |
|
"kl": 0.0137481689453125, |
|
"learning_rate": 6.789164757094978e-07, |
|
"loss": 0.035, |
|
"reward": 1.1967380195856094, |
|
"reward_std": 0.3427240923047066, |
|
"rewards/accuracy_reward": 0.734693855047226, |
|
"rewards/improved_len_reward_dast": 0.4620441570878029, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 1848.25, |
|
"epoch": 0.4692158133506157, |
|
"grad_norm": 0.18668896975291646, |
|
"kl": 0.011810302734375, |
|
"learning_rate": 6.749965853059164e-07, |
|
"loss": 0.0536, |
|
"reward": 1.3282198309898376, |
|
"reward_std": 0.4290488064289093, |
|
"rewards/accuracy_reward": 0.8520407974720001, |
|
"rewards/improved_len_reward_dast": 0.47617900371551514, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1659.9489440917969, |
|
"epoch": 0.47180816591056385, |
|
"grad_norm": 0.2068391235436955, |
|
"kl": 0.0099334716796875, |
|
"learning_rate": 6.710663900276903e-07, |
|
"loss": 0.0149, |
|
"reward": 1.1044558137655258, |
|
"reward_std": 0.389005184173584, |
|
"rewards/accuracy_reward": 0.7244897931814194, |
|
"rewards/improved_len_reward_dast": 0.37996600940823555, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1548.0152893066406, |
|
"epoch": 0.474400518470512, |
|
"grad_norm": 0.19942963085334378, |
|
"kl": 0.00998687744140625, |
|
"learning_rate": 6.671262138850274e-07, |
|
"loss": 0.0277, |
|
"reward": 1.4036801755428314, |
|
"reward_std": 0.325181283056736, |
|
"rewards/accuracy_reward": 0.846938744187355, |
|
"rewards/improved_len_reward_dast": 0.5567413941025734, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1479.9234619140625, |
|
"epoch": 0.4769928710304601, |
|
"grad_norm": 0.17528837750916904, |
|
"kl": 0.00907135009765625, |
|
"learning_rate": 6.631763817109717e-07, |
|
"loss": 0.0212, |
|
"reward": 1.4963186979293823, |
|
"reward_std": 0.2380654364824295, |
|
"rewards/accuracy_reward": 0.8826530426740646, |
|
"rewards/improved_len_reward_dast": 0.6136656627058983, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1625.2856750488281, |
|
"epoch": 0.4795852235904083, |
|
"grad_norm": 0.2340295745334256, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 6.592172191346218e-07, |
|
"loss": 0.0387, |
|
"reward": 1.3299905359745026, |
|
"reward_std": 0.4121420457959175, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.5085620209574699, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 1799.586669921875, |
|
"epoch": 0.48217757615035645, |
|
"grad_norm": 0.208310701570096, |
|
"kl": 0.012359619140625, |
|
"learning_rate": 6.552490525542864e-07, |
|
"loss": 0.0341, |
|
"reward": 1.2161507308483124, |
|
"reward_std": 0.3565462492406368, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.4559466913342476, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 1612.836685180664, |
|
"epoch": 0.4847699287103046, |
|
"grad_norm": 0.1767048426760215, |
|
"kl": 0.0106048583984375, |
|
"learning_rate": 6.512722091105757e-07, |
|
"loss": -0.0013, |
|
"reward": 1.3248589038848877, |
|
"reward_std": 0.45474397391080856, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.5136343911290169, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1306.5509796142578, |
|
"epoch": 0.4873622812702528, |
|
"grad_norm": 0.212241902185087, |
|
"kl": 0.00981903076171875, |
|
"learning_rate": 6.472870166594314e-07, |
|
"loss": 0.0047, |
|
"reward": 1.4141908586025238, |
|
"reward_std": 0.4169772267341614, |
|
"rewards/accuracy_reward": 0.8418367058038712, |
|
"rewards/improved_len_reward_dast": 0.5723541006445885, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1914.642822265625, |
|
"epoch": 0.4899546338302009, |
|
"grad_norm": 0.2520686184939368, |
|
"kl": 0.0127410888671875, |
|
"learning_rate": 6.432938037450974e-07, |
|
"loss": -0.0237, |
|
"reward": 1.1971821933984756, |
|
"reward_std": 0.3514118604362011, |
|
"rewards/accuracy_reward": 0.7499999850988388, |
|
"rewards/improved_len_reward_dast": 0.44718217849731445, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 1808.9183349609375, |
|
"epoch": 0.49254698639014904, |
|
"grad_norm": 0.2130749709969565, |
|
"kl": 0.01201629638671875, |
|
"learning_rate": 6.392928995730352e-07, |
|
"loss": 0.0412, |
|
"reward": 1.2710473388433456, |
|
"reward_std": 0.3865230418741703, |
|
"rewards/accuracy_reward": 0.7908163219690323, |
|
"rewards/improved_len_reward_dast": 0.48023101314902306, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1365.4795837402344, |
|
"epoch": 0.4951393389500972, |
|
"grad_norm": 0.250237755024117, |
|
"kl": 0.00952911376953125, |
|
"learning_rate": 6.352846339827826e-07, |
|
"loss": 0.095, |
|
"reward": 1.5109961926937103, |
|
"reward_std": 0.30784352123737335, |
|
"rewards/accuracy_reward": 0.9132653027772903, |
|
"rewards/improved_len_reward_dast": 0.5977308824658394, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 1425.2755126953125, |
|
"epoch": 0.49773169151004537, |
|
"grad_norm": 0.22368363257945995, |
|
"kl": 0.0114288330078125, |
|
"learning_rate": 6.312693374207627e-07, |
|
"loss": 0.0195, |
|
"reward": 1.2838004529476166, |
|
"reward_std": 0.46850764751434326, |
|
"rewards/accuracy_reward": 0.8265306055545807, |
|
"rewards/improved_len_reward_dast": 0.4572698399424553, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1588.5101623535156, |
|
"epoch": 0.5003240440699935, |
|
"grad_norm": 0.20204139731047027, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 6.272473409130397e-07, |
|
"loss": 0.0012, |
|
"reward": 1.3159003108739853, |
|
"reward_std": 0.4093224108219147, |
|
"rewards/accuracy_reward": 0.8316326439380646, |
|
"rewards/improved_len_reward_dast": 0.484267670661211, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1411.3571166992188, |
|
"epoch": 0.5029163966299417, |
|
"grad_norm": 0.19443397701968118, |
|
"kl": 0.00821685791015625, |
|
"learning_rate": 6.232189760380301e-07, |
|
"loss": 0.0224, |
|
"reward": 1.288124531507492, |
|
"reward_std": 0.3209230378270149, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.5024102553725243, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1751.6785278320312, |
|
"epoch": 0.5055087491898899, |
|
"grad_norm": 0.18304814418314927, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 6.191845748991671e-07, |
|
"loss": -0.007, |
|
"reward": 1.0736610293388367, |
|
"reward_std": 0.32857421785593033, |
|
"rewards/accuracy_reward": 0.6581632494926453, |
|
"rewards/improved_len_reward_dast": 0.41549770161509514, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 1771.5968933105469, |
|
"epoch": 0.508101101749838, |
|
"grad_norm": 0.20612952277089522, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 6.151444700975203e-07, |
|
"loss": 0.0106, |
|
"reward": 1.360820233821869, |
|
"reward_std": 0.38221075385808945, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.518983505666256, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 2076.3060913085938, |
|
"epoch": 0.5106934543097861, |
|
"grad_norm": 0.22320859434163112, |
|
"kl": 0.0132293701171875, |
|
"learning_rate": 6.110989947043767e-07, |
|
"loss": 0.0519, |
|
"reward": 1.101119041442871, |
|
"reward_std": 0.4651700109243393, |
|
"rewards/accuracy_reward": 0.7244897931814194, |
|
"rewards/improved_len_reward_dast": 0.37662921100854874, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 1513.6530151367188, |
|
"epoch": 0.5132858068697342, |
|
"grad_norm": 0.24160481879222073, |
|
"kl": 0.0120849609375, |
|
"learning_rate": 6.070484822337816e-07, |
|
"loss": 0.0617, |
|
"reward": 1.3807711601257324, |
|
"reward_std": 0.30266276001930237, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.5185262858867645, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 1659.4744262695312, |
|
"epoch": 0.5158781594296824, |
|
"grad_norm": 0.2860111752617934, |
|
"kl": 0.0122528076171875, |
|
"learning_rate": 6.029932666150431e-07, |
|
"loss": 0.0487, |
|
"reward": 1.27889584004879, |
|
"reward_std": 0.40974466502666473, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.4778754487633705, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1553.6479187011719, |
|
"epoch": 0.5184705119896306, |
|
"grad_norm": 0.17284042761570728, |
|
"kl": 0.0113372802734375, |
|
"learning_rate": 5.989336821652029e-07, |
|
"loss": -0.0157, |
|
"reward": 1.292808324098587, |
|
"reward_std": 0.3536081798374653, |
|
"rewards/accuracy_reward": 0.7755101919174194, |
|
"rewards/improved_len_reward_dast": 0.517298124730587, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1221.6734313964844, |
|
"epoch": 0.5210628645495787, |
|
"grad_norm": 0.20576387898105802, |
|
"kl": 0.00975799560546875, |
|
"learning_rate": 5.948700635614745e-07, |
|
"loss": 0.0155, |
|
"reward": 1.043928012251854, |
|
"reward_std": 0.5074506774544716, |
|
"rewards/accuracy_reward": 0.734693855047226, |
|
"rewards/improved_len_reward_dast": 0.3092341625597328, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1443.3367156982422, |
|
"epoch": 0.5236552171095269, |
|
"grad_norm": 0.190656293014884, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 5.908027458136518e-07, |
|
"loss": 0.027, |
|
"reward": 1.5769412517547607, |
|
"reward_std": 0.27542993798851967, |
|
"rewards/accuracy_reward": 0.9081632494926453, |
|
"rewards/improved_len_reward_dast": 0.6687779873609543, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1383.1325988769531, |
|
"epoch": 0.5262475696694751, |
|
"grad_norm": 0.18700146403961007, |
|
"kl": 0.00789642333984375, |
|
"learning_rate": 5.867320642364916e-07, |
|
"loss": -0.0, |
|
"reward": 1.4069096446037292, |
|
"reward_std": 0.452865906059742, |
|
"rewards/accuracy_reward": 0.8571428507566452, |
|
"rewards/improved_len_reward_dast": 0.5497667863965034, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1636.7448425292969, |
|
"epoch": 0.5288399222294232, |
|
"grad_norm": 0.18621798443065538, |
|
"kl": 0.01001739501953125, |
|
"learning_rate": 5.826583544220678e-07, |
|
"loss": 0.0023, |
|
"reward": 1.1149714589118958, |
|
"reward_std": 0.5129830092191696, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.3751755505800247, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1296.4540252685547, |
|
"epoch": 0.5314322747893714, |
|
"grad_norm": 0.24973009441281563, |
|
"kl": 0.00960540771484375, |
|
"learning_rate": 5.78581952212107e-07, |
|
"loss": 0.057, |
|
"reward": 1.439581423997879, |
|
"reward_std": 0.20332731679081917, |
|
"rewards/accuracy_reward": 0.8775510191917419, |
|
"rewards/improved_len_reward_dast": 0.5620303899049759, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1675.2040405273438, |
|
"epoch": 0.5340246273493195, |
|
"grad_norm": 0.17994542833868402, |
|
"kl": 0.0113983154296875, |
|
"learning_rate": 5.745031936702997e-07, |
|
"loss": 0.0212, |
|
"reward": 1.236918032169342, |
|
"reward_std": 0.4141309931874275, |
|
"rewards/accuracy_reward": 0.7755101919174194, |
|
"rewards/improved_len_reward_dast": 0.46140778064727783, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1685.6376953125, |
|
"epoch": 0.5366169799092677, |
|
"grad_norm": 0.19387833193950482, |
|
"kl": 0.0142364501953125, |
|
"learning_rate": 5.704224150545956e-07, |
|
"loss": 0.0032, |
|
"reward": 1.1570499688386917, |
|
"reward_std": 0.4146932289004326, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.4172540530562401, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1249.0101928710938, |
|
"epoch": 0.5392093324692158, |
|
"grad_norm": 0.1923070203823955, |
|
"kl": 0.0085906982421875, |
|
"learning_rate": 5.663399527894816e-07, |
|
"loss": 0.0138, |
|
"reward": 1.4272409826517105, |
|
"reward_std": 0.34243838489055634, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.5649960786104202, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1525.1734313964844, |
|
"epoch": 0.5418016850291639, |
|
"grad_norm": 0.19609225255735566, |
|
"kl": 0.01036834716796875, |
|
"learning_rate": 5.622561434382467e-07, |
|
"loss": 0.0011, |
|
"reward": 1.1873522847890854, |
|
"reward_std": 0.4918947294354439, |
|
"rewards/accuracy_reward": 0.8010203838348389, |
|
"rewards/improved_len_reward_dast": 0.386331919580698, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 1988.4591064453125, |
|
"epoch": 0.5443940375891121, |
|
"grad_norm": 0.2322805815292897, |
|
"kl": 0.0143280029296875, |
|
"learning_rate": 5.581713236752361e-07, |
|
"loss": 0.0289, |
|
"reward": 1.1922202408313751, |
|
"reward_std": 0.2860515546053648, |
|
"rewards/accuracy_reward": 0.7244897782802582, |
|
"rewards/improved_len_reward_dast": 0.46773041412234306, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1433.290771484375, |
|
"epoch": 0.5469863901490603, |
|
"grad_norm": 0.2984688713886969, |
|
"kl": 0.0114898681640625, |
|
"learning_rate": 5.540858302580934e-07, |
|
"loss": 0.0818, |
|
"reward": 1.3492214977741241, |
|
"reward_std": 0.3557019531726837, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.48697663098573685, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1686.086669921875, |
|
"epoch": 0.5495787427090084, |
|
"grad_norm": 0.17323504261296585, |
|
"kl": 0.01081085205078125, |
|
"learning_rate": 5.5e-07, |
|
"loss": -0.0227, |
|
"reward": 0.910240039229393, |
|
"reward_std": 0.49440842866897583, |
|
"rewards/accuracy_reward": 0.6632653027772903, |
|
"rewards/improved_len_reward_dast": 0.24697477743029594, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1503.3571166992188, |
|
"epoch": 0.5521710952689566, |
|
"grad_norm": 0.19940687047680583, |
|
"kl": 0.0108795166015625, |
|
"learning_rate": 5.459141697419066e-07, |
|
"loss": 0.0196, |
|
"reward": 1.414816826581955, |
|
"reward_std": 0.24907327815890312, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.5525719411671162, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1326.4744720458984, |
|
"epoch": 0.5547634478289047, |
|
"grad_norm": 0.1968213437884411, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 5.418286763247641e-07, |
|
"loss": 0.0333, |
|
"reward": 1.5710687637329102, |
|
"reward_std": 0.27853039279580116, |
|
"rewards/accuracy_reward": 0.9336734712123871, |
|
"rewards/improved_len_reward_dast": 0.6373953074216843, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1814.7856750488281, |
|
"epoch": 0.5573558003888529, |
|
"grad_norm": 0.1910754560182501, |
|
"kl": 0.0157623291015625, |
|
"learning_rate": 5.377438565617532e-07, |
|
"loss": 0.0053, |
|
"reward": 1.1130409240722656, |
|
"reward_std": 0.5712603330612183, |
|
"rewards/accuracy_reward": 0.7091836780309677, |
|
"rewards/improved_len_reward_dast": 0.4038572832942009, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 2041.4693603515625, |
|
"epoch": 0.5599481529488011, |
|
"grad_norm": 0.19528431114703992, |
|
"kl": 0.017974853515625, |
|
"learning_rate": 5.336600472105186e-07, |
|
"loss": 0.0026, |
|
"reward": 1.1326239556074142, |
|
"reward_std": 0.5115986987948418, |
|
"rewards/accuracy_reward": 0.7193877249956131, |
|
"rewards/improved_len_reward_dast": 0.41323617100715637, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1490.438720703125, |
|
"epoch": 0.5625405055087492, |
|
"grad_norm": 0.1818395863982982, |
|
"kl": 0.011444091796875, |
|
"learning_rate": 5.295775849454045e-07, |
|
"loss": -0.025, |
|
"reward": 1.1338547468185425, |
|
"reward_std": 0.26832524314522743, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/improved_len_reward_dast": 0.3838547393679619, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 1993.8571166992188, |
|
"epoch": 0.5651328580686974, |
|
"grad_norm": 0.23754078779498058, |
|
"kl": 0.0171356201171875, |
|
"learning_rate": 5.254968063297003e-07, |
|
"loss": -0.0245, |
|
"reward": 1.088214099407196, |
|
"reward_std": 0.33989886194467545, |
|
"rewards/accuracy_reward": 0.6938775330781937, |
|
"rewards/improved_len_reward_dast": 0.3943365402519703, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1916.8775024414062, |
|
"epoch": 0.5677252106286454, |
|
"grad_norm": 0.23169329147427764, |
|
"kl": 0.0146942138671875, |
|
"learning_rate": 5.214180477878931e-07, |
|
"loss": -0.0216, |
|
"reward": 1.1535532772541046, |
|
"reward_std": 0.5523173958063126, |
|
"rewards/accuracy_reward": 0.739795908331871, |
|
"rewards/improved_len_reward_dast": 0.4137573465704918, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 2072.586700439453, |
|
"epoch": 0.5703175631885936, |
|
"grad_norm": 0.179237513002948, |
|
"kl": 0.0157623291015625, |
|
"learning_rate": 5.173416455779323e-07, |
|
"loss": 0.0061, |
|
"reward": 1.129465639591217, |
|
"reward_std": 0.47254087403416634, |
|
"rewards/accuracy_reward": 0.7397958934307098, |
|
"rewards/improved_len_reward_dast": 0.3896697536110878, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1500.7499694824219, |
|
"epoch": 0.5729099157485418, |
|
"grad_norm": 0.18878843129064268, |
|
"kl": 0.01107025146484375, |
|
"learning_rate": 5.132679357635086e-07, |
|
"loss": -0.0142, |
|
"reward": 1.1763963997364044, |
|
"reward_std": 0.48718392848968506, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.40598829090595245, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 1644.9030151367188, |
|
"epoch": 0.5755022683084899, |
|
"grad_norm": 0.17742073908553643, |
|
"kl": 0.0126495361328125, |
|
"learning_rate": 5.091972541863481e-07, |
|
"loss": 0.0186, |
|
"reward": 1.1986051201820374, |
|
"reward_std": 0.4172977935522795, |
|
"rewards/accuracy_reward": 0.734693855047226, |
|
"rewards/improved_len_reward_dast": 0.463911272585392, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1161.091812133789, |
|
"epoch": 0.5780946208684381, |
|
"grad_norm": 0.189357723748229, |
|
"kl": 0.00917816162109375, |
|
"learning_rate": 5.051299364385257e-07, |
|
"loss": 0.0034, |
|
"reward": 1.5119259655475616, |
|
"reward_std": 0.34742674231529236, |
|
"rewards/accuracy_reward": 0.9030611962080002, |
|
"rewards/improved_len_reward_dast": 0.6088647544384003, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 2160.7142944335938, |
|
"epoch": 0.5806869734283863, |
|
"grad_norm": 0.1958816052872559, |
|
"kl": 0.0196075439453125, |
|
"learning_rate": 5.010663178347971e-07, |
|
"loss": 0.0345, |
|
"reward": 1.2357909381389618, |
|
"reward_std": 0.4518684595823288, |
|
"rewards/accuracy_reward": 0.7448979318141937, |
|
"rewards/improved_len_reward_dast": 0.4908929914236069, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1368.7703552246094, |
|
"epoch": 0.5832793259883344, |
|
"grad_norm": 0.2126816864157868, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 4.970067333849568e-07, |
|
"loss": 0.0421, |
|
"reward": 1.3800954520702362, |
|
"reward_std": 0.24764511361718178, |
|
"rewards/accuracy_reward": 0.8163265287876129, |
|
"rewards/improved_len_reward_dast": 0.5637688413262367, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1523.7958984375, |
|
"epoch": 0.5858716785482826, |
|
"grad_norm": 0.2103498219912096, |
|
"kl": 0.013336181640625, |
|
"learning_rate": 4.929515177662182e-07, |
|
"loss": 0.0336, |
|
"reward": 1.3088043332099915, |
|
"reward_std": 0.3938099816441536, |
|
"rewards/accuracy_reward": 0.8214285671710968, |
|
"rewards/improved_len_reward_dast": 0.48737573623657227, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1753.9897766113281, |
|
"epoch": 0.5884640311082308, |
|
"grad_norm": 0.17623732882686455, |
|
"kl": 0.0133514404296875, |
|
"learning_rate": 4.889010052956233e-07, |
|
"loss": 0.0184, |
|
"reward": 1.1956195682287216, |
|
"reward_std": 0.38174545764923096, |
|
"rewards/accuracy_reward": 0.7551020234823227, |
|
"rewards/improved_len_reward_dast": 0.44051752984523773, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1186.4795837402344, |
|
"epoch": 0.5910563836681789, |
|
"grad_norm": 0.19103765244425439, |
|
"kl": 0.00911712646484375, |
|
"learning_rate": 4.848555299024798e-07, |
|
"loss": -0.0025, |
|
"reward": 1.3858640789985657, |
|
"reward_std": 0.2998353075236082, |
|
"rewards/accuracy_reward": 0.8724489808082581, |
|
"rewards/improved_len_reward_dast": 0.5134151205420494, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1717.0713806152344, |
|
"epoch": 0.593648736228127, |
|
"grad_norm": 0.1787260124676487, |
|
"kl": 0.01560211181640625, |
|
"learning_rate": 4.80815425100833e-07, |
|
"loss": 0.0131, |
|
"reward": 1.2940033674240112, |
|
"reward_std": 0.3880784399807453, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.5031870305538177, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1570.3979187011719, |
|
"epoch": 0.5962410887880751, |
|
"grad_norm": 0.1932563584259016, |
|
"kl": 0.0125732421875, |
|
"learning_rate": 4.7678102396196983e-07, |
|
"loss": 0.0028, |
|
"reward": 1.194681242108345, |
|
"reward_std": 0.36879952996969223, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.4242731127887964, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1627.1173400878906, |
|
"epoch": 0.5988334413480233, |
|
"grad_norm": 0.20069193255347081, |
|
"kl": 0.01148223876953125, |
|
"learning_rate": 4.727526590869605e-07, |
|
"loss": -0.0024, |
|
"reward": 1.2599404603242874, |
|
"reward_std": 0.3717983737587929, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.45381802320480347, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1422.693832397461, |
|
"epoch": 0.6014257939079715, |
|
"grad_norm": 0.22397903045763606, |
|
"kl": 0.011993408203125, |
|
"learning_rate": 4.6873066257923735e-07, |
|
"loss": -0.0198, |
|
"reward": 1.1824947893619537, |
|
"reward_std": 0.3314864858984947, |
|
"rewards/accuracy_reward": 0.7806122153997421, |
|
"rewards/improved_len_reward_dast": 0.4018825590610504, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 2077.2550659179688, |
|
"epoch": 0.6040181464679196, |
|
"grad_norm": 0.2622807945246562, |
|
"kl": 0.0151519775390625, |
|
"learning_rate": 4.647153660172173e-07, |
|
"loss": 0.0607, |
|
"reward": 1.1635594964027405, |
|
"reward_std": 0.392416313290596, |
|
"rewards/accuracy_reward": 0.7499999701976776, |
|
"rewards/improved_len_reward_dast": 0.4135594889521599, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1738.4336547851562, |
|
"epoch": 0.6066104990278678, |
|
"grad_norm": 0.24814578097643056, |
|
"kl": 0.01483917236328125, |
|
"learning_rate": 4.607071004269647e-07, |
|
"loss": 0.031, |
|
"reward": 1.369605004787445, |
|
"reward_std": 0.3843038082122803, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.5583804696798325, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1602.0713806152344, |
|
"epoch": 0.609202851587816, |
|
"grad_norm": 0.2094489678458985, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 4.567061962549025e-07, |
|
"loss": -0.0277, |
|
"reward": 1.1768890023231506, |
|
"reward_std": 0.5075602382421494, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.4115828797221184, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1883.586669921875, |
|
"epoch": 0.6117952041477641, |
|
"grad_norm": 0.18539849926073623, |
|
"kl": 0.01873779296875, |
|
"learning_rate": 4.527129833405687e-07, |
|
"loss": 0.0234, |
|
"reward": 1.2962508648633957, |
|
"reward_std": 0.23112722299993038, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.5309447646141052, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1541.188720703125, |
|
"epoch": 0.6143875567077123, |
|
"grad_norm": 0.2211580384146908, |
|
"kl": 0.013671875, |
|
"learning_rate": 4.4872779088942425e-07, |
|
"loss": 0.027, |
|
"reward": 1.3446270525455475, |
|
"reward_std": 0.4020156227052212, |
|
"rewards/accuracy_reward": 0.8265305906534195, |
|
"rewards/improved_len_reward_dast": 0.5180964693427086, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1877.1122131347656, |
|
"epoch": 0.6169799092676604, |
|
"grad_norm": 0.27937868976565, |
|
"kl": 0.0175018310546875, |
|
"learning_rate": 4.447509474457135e-07, |
|
"loss": -0.0519, |
|
"reward": 1.3078001737594604, |
|
"reward_std": 0.3943771682679653, |
|
"rewards/accuracy_reward": 0.811224490404129, |
|
"rewards/improved_len_reward_dast": 0.49657563865184784, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1735.6836547851562, |
|
"epoch": 0.6195722618276086, |
|
"grad_norm": 0.19004402096856263, |
|
"kl": 0.013519287109375, |
|
"learning_rate": 4.4078278086537823e-07, |
|
"loss": 0.019, |
|
"reward": 1.430199384689331, |
|
"reward_std": 0.45470841974020004, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.5883626788854599, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1290.8877258300781, |
|
"epoch": 0.6221646143875567, |
|
"grad_norm": 0.20039034607000805, |
|
"kl": 0.00916290283203125, |
|
"learning_rate": 4.3682361828902846e-07, |
|
"loss": 0.0204, |
|
"reward": 1.4429042339324951, |
|
"reward_std": 0.40230638161301613, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.5857614576816559, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1543.5713958740234, |
|
"epoch": 0.6247569669475048, |
|
"grad_norm": 0.1796128893155037, |
|
"kl": 0.0121002197265625, |
|
"learning_rate": 4.328737861149726e-07, |
|
"loss": 0.0061, |
|
"reward": 1.060480311512947, |
|
"reward_std": 0.4090285710990429, |
|
"rewards/accuracy_reward": 0.7040816247463226, |
|
"rewards/improved_len_reward_dast": 0.35639870166778564, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1650.6581420898438, |
|
"epoch": 0.627349319507453, |
|
"grad_norm": 0.17035045538288204, |
|
"kl": 0.0127410888671875, |
|
"learning_rate": 4.289336099723098e-07, |
|
"loss": -0.0068, |
|
"reward": 1.2868027091026306, |
|
"reward_std": 0.4846101552248001, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.49088432639837265, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1806.8724212646484, |
|
"epoch": 0.6299416720674011, |
|
"grad_norm": 0.21153725027052578, |
|
"kl": 0.01531982421875, |
|
"learning_rate": 4.250034146940834e-07, |
|
"loss": 0.0342, |
|
"reward": 1.3773571997880936, |
|
"reward_std": 0.32580330967903137, |
|
"rewards/accuracy_reward": 0.8265305906534195, |
|
"rewards/improved_len_reward_dast": 0.5508265644311905, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1506.8877410888672, |
|
"epoch": 0.6325340246273493, |
|
"grad_norm": 0.20274200364313702, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 4.210835242905023e-07, |
|
"loss": 0.0114, |
|
"reward": 1.3944001197814941, |
|
"reward_std": 0.35993905924260616, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.5270532071590424, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1694.5713806152344, |
|
"epoch": 0.6351263771872975, |
|
"grad_norm": 0.20631633070295144, |
|
"kl": 0.01531982421875, |
|
"learning_rate": 4.1717426192222784e-07, |
|
"loss": 0.001, |
|
"reward": 1.269565299153328, |
|
"reward_std": 0.3799453191459179, |
|
"rewards/accuracy_reward": 0.7908162921667099, |
|
"rewards/improved_len_reward_dast": 0.4787489101290703, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 2018.9642028808594, |
|
"epoch": 0.6377187297472456, |
|
"grad_norm": 0.23377044647625822, |
|
"kl": 0.01549530029296875, |
|
"learning_rate": 4.1327594987373347e-07, |
|
"loss": 0.0057, |
|
"reward": 0.9710913375020027, |
|
"reward_std": 0.4150635525584221, |
|
"rewards/accuracy_reward": 0.6479591578245163, |
|
"rewards/improved_len_reward_dast": 0.3231321321800351, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 1953.44384765625, |
|
"epoch": 0.6403110823071938, |
|
"grad_norm": 0.18922091960973522, |
|
"kl": 0.0152740478515625, |
|
"learning_rate": 4.0938890952673443e-07, |
|
"loss": -0.0073, |
|
"reward": 1.144493117928505, |
|
"reward_std": 0.326381828635931, |
|
"rewards/accuracy_reward": 0.6989795714616776, |
|
"rewards/improved_len_reward_dast": 0.445513516664505, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1779.9234771728516, |
|
"epoch": 0.642903434867142, |
|
"grad_norm": 0.19009690153217312, |
|
"kl": 0.01587677001953125, |
|
"learning_rate": 4.05513461333693e-07, |
|
"loss": 0.0056, |
|
"reward": 1.2144882082939148, |
|
"reward_std": 0.3660648465156555, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.45428410917520523, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1680.5816040039062, |
|
"epoch": 0.6454957874270901, |
|
"grad_norm": 0.18737871436935236, |
|
"kl": 0.01519775390625, |
|
"learning_rate": 4.016499247913994e-07, |
|
"loss": 0.0155, |
|
"reward": 1.228882908821106, |
|
"reward_std": 0.42849814891815186, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.4584747403860092, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1700.0765075683594, |
|
"epoch": 0.6480881399870383, |
|
"grad_norm": 0.19083582747427946, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 3.977986184146328e-07, |
|
"loss": 0.0276, |
|
"reward": 1.4491282403469086, |
|
"reward_std": 0.29963432252407074, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.6021894812583923, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 1699.5050964355469, |
|
"epoch": 0.6506804925469863, |
|
"grad_norm": 0.18294974628895902, |
|
"kl": 0.01318359375, |
|
"learning_rate": 3.939598597099022e-07, |
|
"loss": -0.0028, |
|
"reward": 1.1291119307279587, |
|
"reward_std": 0.4640827924013138, |
|
"rewards/accuracy_reward": 0.7499999850988388, |
|
"rewards/improved_len_reward_dast": 0.3791119046509266, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1555.9489135742188, |
|
"epoch": 0.6532728451069345, |
|
"grad_norm": 0.2987585035266382, |
|
"kl": 0.013702392578125, |
|
"learning_rate": 3.9013396514927076e-07, |
|
"loss": -0.0182, |
|
"reward": 1.2567480206489563, |
|
"reward_std": 0.38375869020819664, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.4710337221622467, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 2022.5509643554688, |
|
"epoch": 0.6558651976668827, |
|
"grad_norm": 0.16778625708063813, |
|
"kl": 0.0160064697265625, |
|
"learning_rate": 3.8632125014426566e-07, |
|
"loss": 0.0026, |
|
"reward": 1.0748438835144043, |
|
"reward_std": 0.3207223527133465, |
|
"rewards/accuracy_reward": 0.6836734712123871, |
|
"rewards/improved_len_reward_dast": 0.3911704570055008, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 2008.7550659179688, |
|
"epoch": 0.6584575502268308, |
|
"grad_norm": 0.20081517128616475, |
|
"kl": 0.017364501953125, |
|
"learning_rate": 3.8252202901987474e-07, |
|
"loss": -0.0036, |
|
"reward": 1.1095408350229263, |
|
"reward_std": 0.42732013761997223, |
|
"rewards/accuracy_reward": 0.7193877398967743, |
|
"rewards/improved_len_reward_dast": 0.39015308022499084, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1753.5305786132812, |
|
"epoch": 0.661049902786779, |
|
"grad_norm": 0.19286213527020518, |
|
"kl": 0.015838623046875, |
|
"learning_rate": 3.7873661498863384e-07, |
|
"loss": -0.0193, |
|
"reward": 1.3401989042758942, |
|
"reward_std": 0.44482723623514175, |
|
"rewards/accuracy_reward": 0.8367346823215485, |
|
"rewards/improved_len_reward_dast": 0.5034642219543457, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1714.8316040039062, |
|
"epoch": 0.6636422553467272, |
|
"grad_norm": 0.19098352531749854, |
|
"kl": 0.015716552734375, |
|
"learning_rate": 3.7496532012480463e-07, |
|
"loss": -0.0172, |
|
"reward": 1.285597413778305, |
|
"reward_std": 0.3779995068907738, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.4947810471057892, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1587.0254821777344, |
|
"epoch": 0.6662346079066753, |
|
"grad_norm": 0.1828164836366847, |
|
"kl": 0.01513671875, |
|
"learning_rate": 3.7120845533864706e-07, |
|
"loss": 0.0165, |
|
"reward": 1.2909784018993378, |
|
"reward_std": 0.3537175990641117, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.5001621246337891, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1945.8519897460938, |
|
"epoch": 0.6688269604666235, |
|
"grad_norm": 0.2401064586242113, |
|
"kl": 0.018310546875, |
|
"learning_rate": 3.6746633035078723e-07, |
|
"loss": -0.0254, |
|
"reward": 0.9318393021821976, |
|
"reward_std": 0.3634992204606533, |
|
"rewards/accuracy_reward": 0.6530612260103226, |
|
"rewards/improved_len_reward_dast": 0.2787781246006489, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1464.5356903076172, |
|
"epoch": 0.6714193130265717, |
|
"grad_norm": 0.19897550034047456, |
|
"kl": 0.0117645263671875, |
|
"learning_rate": 3.63739253666684e-07, |
|
"loss": 0.0257, |
|
"reward": 1.3326016068458557, |
|
"reward_std": 0.25891564041376114, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.48566286638379097, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 2040.6173095703125, |
|
"epoch": 0.6740116655865198, |
|
"grad_norm": 0.2093225075876704, |
|
"kl": 0.01587677001953125, |
|
"learning_rate": 3.6002753255119533e-07, |
|
"loss": 0.0446, |
|
"reward": 1.1549495160579681, |
|
"reward_std": 0.6060752719640732, |
|
"rewards/accuracy_reward": 0.7295918166637421, |
|
"rewards/improved_len_reward_dast": 0.42535772174596786, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1504.892837524414, |
|
"epoch": 0.6766040181464679, |
|
"grad_norm": 0.2413238757963301, |
|
"kl": 0.013092041015625, |
|
"learning_rate": 3.5633147300324706e-07, |
|
"loss": 0.039, |
|
"reward": 1.3253722488880157, |
|
"reward_std": 0.22303567081689835, |
|
"rewards/accuracy_reward": 0.7755101919174194, |
|
"rewards/improved_len_reward_dast": 0.5498620271682739, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1835.6020202636719, |
|
"epoch": 0.679196370706416, |
|
"grad_norm": 0.1742605810963208, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 3.526513797306051e-07, |
|
"loss": 0.023, |
|
"reward": 1.3810910284519196, |
|
"reward_std": 0.3878571353852749, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.5341522693634033, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1934.44384765625, |
|
"epoch": 0.6817887232663642, |
|
"grad_norm": 0.18402016017590034, |
|
"kl": 0.0189971923828125, |
|
"learning_rate": 3.489875561247568e-07, |
|
"loss": 0.0326, |
|
"reward": 1.1064758449792862, |
|
"reward_std": 0.5427646264433861, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/improved_len_reward_dast": 0.3564758636057377, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1527.6479187011719, |
|
"epoch": 0.6843810758263124, |
|
"grad_norm": 0.2535051321853217, |
|
"kl": 0.0133209228515625, |
|
"learning_rate": 3.453403042358968e-07, |
|
"loss": 0.0594, |
|
"reward": 1.3837721645832062, |
|
"reward_std": 0.3384307250380516, |
|
"rewards/accuracy_reward": 0.8571428507566452, |
|
"rewards/improved_len_reward_dast": 0.5266292989253998, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 1750.1275329589844, |
|
"epoch": 0.6869734283862605, |
|
"grad_norm": 0.20005193883523226, |
|
"kl": 0.014312744140625, |
|
"learning_rate": 3.417099247480277e-07, |
|
"loss": 0.0069, |
|
"reward": 1.1163494735956192, |
|
"reward_std": 0.4810503050684929, |
|
"rewards/accuracy_reward": 0.7295918166637421, |
|
"rewards/improved_len_reward_dast": 0.3867576252669096, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 1910.5254821777344, |
|
"epoch": 0.6895657809462087, |
|
"grad_norm": 0.3018048627256463, |
|
"kl": 0.0156402587890625, |
|
"learning_rate": 3.3809671695416916e-07, |
|
"loss": 0.0357, |
|
"reward": 1.147754654288292, |
|
"reward_std": 0.5025169178843498, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.3824485056102276, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1284.0663146972656, |
|
"epoch": 0.6921581335061568, |
|
"grad_norm": 0.18258330323366856, |
|
"kl": 0.0092926025390625, |
|
"learning_rate": 3.345009787316859e-07, |
|
"loss": 0.0015, |
|
"reward": 1.4202894866466522, |
|
"reward_std": 0.2870555892586708, |
|
"rewards/accuracy_reward": 0.8418367058038712, |
|
"rewards/improved_len_reward_dast": 0.5784527361392975, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1557.5612030029297, |
|
"epoch": 0.694750486066105, |
|
"grad_norm": 0.1849700340313966, |
|
"kl": 0.012725830078125, |
|
"learning_rate": 3.309230065177289e-07, |
|
"loss": -0.0079, |
|
"reward": 1.4877441823482513, |
|
"reward_std": 0.302555400878191, |
|
"rewards/accuracy_reward": 0.8622448742389679, |
|
"rewards/improved_len_reward_dast": 0.6254993677139282, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1482.5203552246094, |
|
"epoch": 0.6973428386260532, |
|
"grad_norm": 0.19171071001803489, |
|
"kl": 0.0144500732421875, |
|
"learning_rate": 3.273630952847971e-07, |
|
"loss": -0.0012, |
|
"reward": 1.2047373950481415, |
|
"reward_std": 0.48537394404411316, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.4445333182811737, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1744.6070861816406, |
|
"epoch": 0.6999351911860013, |
|
"grad_norm": 0.17132128213246742, |
|
"kl": 0.01513671875, |
|
"learning_rate": 3.2382153851641996e-07, |
|
"loss": 0.0229, |
|
"reward": 1.1097373962402344, |
|
"reward_std": 0.2911606300622225, |
|
"rewards/accuracy_reward": 0.7295918166637421, |
|
"rewards/improved_len_reward_dast": 0.38014551997184753, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1705.5968933105469, |
|
"epoch": 0.7025275437459495, |
|
"grad_norm": 0.2582533948663525, |
|
"kl": 0.01708984375, |
|
"learning_rate": 3.202986281829616e-07, |
|
"loss": 0.045, |
|
"reward": 1.3047520220279694, |
|
"reward_std": 0.4435114786028862, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.4986295886337757, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1806.591796875, |
|
"epoch": 0.7051198963058976, |
|
"grad_norm": 0.17993615347196873, |
|
"kl": 0.01581573486328125, |
|
"learning_rate": 3.1679465471755106e-07, |
|
"loss": 0.016, |
|
"reward": 1.2005809843540192, |
|
"reward_std": 0.2893667705357075, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.45568302273750305, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 1960.2244262695312, |
|
"epoch": 0.7077122488658457, |
|
"grad_norm": 0.21394731890393012, |
|
"kl": 0.018402099609375, |
|
"learning_rate": 3.1330990699213824e-07, |
|
"loss": 0.0026, |
|
"reward": 1.3150149285793304, |
|
"reward_std": 0.32834067940711975, |
|
"rewards/accuracy_reward": 0.7602040469646454, |
|
"rewards/improved_len_reward_dast": 0.5548108592629433, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1648.7601623535156, |
|
"epoch": 0.7103046014257939, |
|
"grad_norm": 0.22677843577967902, |
|
"kl": 0.0144500732421875, |
|
"learning_rate": 3.0984467229367885e-07, |
|
"loss": -0.0289, |
|
"reward": 1.186056673526764, |
|
"reward_std": 0.3048909828066826, |
|
"rewards/accuracy_reward": 0.7653061002492905, |
|
"rewards/improved_len_reward_dast": 0.42075058072805405, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1631.3876953125, |
|
"epoch": 0.712896953985742, |
|
"grad_norm": 0.18075852179231652, |
|
"kl": 0.0135955810546875, |
|
"learning_rate": 3.063992363004503e-07, |
|
"loss": -0.0047, |
|
"reward": 1.3900758624076843, |
|
"reward_std": 0.35281531512737274, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.5737493187189102, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1794.5203857421875, |
|
"epoch": 0.7154893065456902, |
|
"grad_norm": 0.20597152512904204, |
|
"kl": 0.0141143798828125, |
|
"learning_rate": 3.0297388305850004e-07, |
|
"loss": 0.0135, |
|
"reward": 1.2308696657419205, |
|
"reward_std": 0.3947853706777096, |
|
"rewards/accuracy_reward": 0.7959183603525162, |
|
"rewards/improved_len_reward_dast": 0.434951264411211, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1608.892822265625, |
|
"epoch": 0.7180816591056384, |
|
"grad_norm": 0.22201185510570046, |
|
"kl": 0.0151519775390625, |
|
"learning_rate": 2.9956889495822877e-07, |
|
"loss": 0.0463, |
|
"reward": 1.3714110851287842, |
|
"reward_std": 0.41973991319537163, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.549982562661171, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1833.0203552246094, |
|
"epoch": 0.7206740116655865, |
|
"grad_norm": 0.18677648497687657, |
|
"kl": 0.0153656005859375, |
|
"learning_rate": 2.961845527111091e-07, |
|
"loss": 0.0087, |
|
"reward": 1.1960042417049408, |
|
"reward_std": 0.35424697771668434, |
|
"rewards/accuracy_reward": 0.7499999850988388, |
|
"rewards/improved_len_reward_dast": 0.4460042342543602, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1663.1989440917969, |
|
"epoch": 0.7232663642255347, |
|
"grad_norm": 0.23408313686800128, |
|
"kl": 0.0152435302734375, |
|
"learning_rate": 2.9282113532654363e-07, |
|
"loss": 0.0496, |
|
"reward": 1.2954119145870209, |
|
"reward_std": 0.4828920140862465, |
|
"rewards/accuracy_reward": 0.8265306055545807, |
|
"rewards/improved_len_reward_dast": 0.46888134628534317, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1693.0254974365234, |
|
"epoch": 0.7258587167854829, |
|
"grad_norm": 0.23913668563173046, |
|
"kl": 0.019439697265625, |
|
"learning_rate": 2.894789200888634e-07, |
|
"loss": 0.0174, |
|
"reward": 1.4143796861171722, |
|
"reward_std": 0.37724653631448746, |
|
"rewards/accuracy_reward": 0.8367346674203873, |
|
"rewards/improved_len_reward_dast": 0.5776450335979462, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1277.8468780517578, |
|
"epoch": 0.728451069345431, |
|
"grad_norm": 0.2694215840510146, |
|
"kl": 0.0134429931640625, |
|
"learning_rate": 2.8615818253446766e-07, |
|
"loss": 0.0046, |
|
"reward": 1.4540930390357971, |
|
"reward_std": 0.3243625983595848, |
|
"rewards/accuracy_reward": 0.8775509893894196, |
|
"rewards/improved_len_reward_dast": 0.5765420496463776, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1236.0356903076172, |
|
"epoch": 0.7310434219053791, |
|
"grad_norm": 0.1871177689494516, |
|
"kl": 0.0116729736328125, |
|
"learning_rate": 2.828591964291093e-07, |
|
"loss": 0.0055, |
|
"reward": 1.2881307899951935, |
|
"reward_std": 0.42027105391025543, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.466702226549387, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1389.3673095703125, |
|
"epoch": 0.7336357744653272, |
|
"grad_norm": 0.17949852486745174, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 2.7958223374532363e-07, |
|
"loss": -0.029, |
|
"reward": 1.2979092001914978, |
|
"reward_std": 0.34224472381174564, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.4407663494348526, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1291.64794921875, |
|
"epoch": 0.7362281270252754, |
|
"grad_norm": 0.20498717449578613, |
|
"kl": 0.01025390625, |
|
"learning_rate": 2.7632756464000835e-07, |
|
"loss": 0.0333, |
|
"reward": 1.6148460805416107, |
|
"reward_std": 0.25412340462207794, |
|
"rewards/accuracy_reward": 0.9234693795442581, |
|
"rewards/improved_len_reward_dast": 0.6913766860961914, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 1941.4284973144531, |
|
"epoch": 0.7388204795852236, |
|
"grad_norm": 0.19896247201933293, |
|
"kl": 0.019378662109375, |
|
"learning_rate": 2.730954574321503e-07, |
|
"loss": 0.0303, |
|
"reward": 1.0792112797498703, |
|
"reward_std": 0.38586486876010895, |
|
"rewards/accuracy_reward": 0.7142857015132904, |
|
"rewards/improved_len_reward_dast": 0.3649255894124508, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1503.8826446533203, |
|
"epoch": 0.7414128321451717, |
|
"grad_norm": 0.22350544706234096, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2.698861785807055e-07, |
|
"loss": 0.0311, |
|
"reward": 1.5651328265666962, |
|
"reward_std": 0.3553974963724613, |
|
"rewards/accuracy_reward": 0.9030612260103226, |
|
"rewards/improved_len_reward_dast": 0.6620715856552124, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1731.8214111328125, |
|
"epoch": 0.7440051847051199, |
|
"grad_norm": 0.23609281842069962, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 2.6669999266263154e-07, |
|
"loss": -0.0306, |
|
"reward": 1.1723814904689789, |
|
"reward_std": 0.5022178217768669, |
|
"rewards/accuracy_reward": 0.7602040767669678, |
|
"rewards/improved_len_reward_dast": 0.41217736527323723, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 1870.0458679199219, |
|
"epoch": 0.7465975372650681, |
|
"grad_norm": 0.15632978700328695, |
|
"kl": 0.0158843994140625, |
|
"learning_rate": 2.635371623510758e-07, |
|
"loss": 0.0204, |
|
"reward": 1.0800221413373947, |
|
"reward_std": 0.2878151945769787, |
|
"rewards/accuracy_reward": 0.6887754872441292, |
|
"rewards/improved_len_reward_dast": 0.39124663546681404, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1414.2703552246094, |
|
"epoch": 0.7491898898250162, |
|
"grad_norm": 0.23286966119816113, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2.6039794839372066e-07, |
|
"loss": -0.0074, |
|
"reward": 1.341863602399826, |
|
"reward_std": 0.36198627576231956, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.530639074742794, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1749.2295532226562, |
|
"epoch": 0.7517822423849644, |
|
"grad_norm": 0.17241966258758817, |
|
"kl": 0.0135955810546875, |
|
"learning_rate": 2.5728260959128614e-07, |
|
"loss": -0.0129, |
|
"reward": 1.2213443964719772, |
|
"reward_std": 0.4387034922838211, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.46114034205675125, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 2126.826446533203, |
|
"epoch": 0.7543745949449125, |
|
"grad_norm": 0.2030042278234921, |
|
"kl": 0.018890380859375, |
|
"learning_rate": 2.541914027761951e-07, |
|
"loss": 0.0435, |
|
"reward": 1.1566181033849716, |
|
"reward_std": 0.505137488245964, |
|
"rewards/accuracy_reward": 0.7244897782802582, |
|
"rewards/improved_len_reward_dast": 0.43212827295064926, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1632.0713653564453, |
|
"epoch": 0.7569669475048607, |
|
"grad_norm": 0.24718377241844533, |
|
"kl": 0.016876220703125, |
|
"learning_rate": 2.511245827913991e-07, |
|
"loss": 0.0421, |
|
"reward": 1.2267541885375977, |
|
"reward_std": 0.3394501358270645, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.4563460126519203, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1807.6529846191406, |
|
"epoch": 0.7595593000648088, |
|
"grad_norm": 0.1861047697263272, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 2.4808240246936866e-07, |
|
"loss": -0.0078, |
|
"reward": 1.2387667298316956, |
|
"reward_std": 0.4819525480270386, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.44284842535853386, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1847.19384765625, |
|
"epoch": 0.7621516526247569, |
|
"grad_norm": 0.22670935044930915, |
|
"kl": 0.018310546875, |
|
"learning_rate": 2.450651126112504e-07, |
|
"loss": 0.0266, |
|
"reward": 1.4322427809238434, |
|
"reward_std": 0.2754583992063999, |
|
"rewards/accuracy_reward": 0.8418367058038712, |
|
"rewards/improved_len_reward_dast": 0.590406060218811, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 1595.9795532226562, |
|
"epoch": 0.7647440051847051, |
|
"grad_norm": 0.20527730505286215, |
|
"kl": 0.015838623046875, |
|
"learning_rate": 2.4207296196618924e-07, |
|
"loss": 0.0242, |
|
"reward": 1.3626587092876434, |
|
"reward_std": 0.32539451494812965, |
|
"rewards/accuracy_reward": 0.7908162921667099, |
|
"rewards/improved_len_reward_dast": 0.5718424171209335, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 1054.137710571289, |
|
"epoch": 0.7673363577446533, |
|
"grad_norm": 0.21493362850187817, |
|
"kl": 0.0093536376953125, |
|
"learning_rate": 2.3910619721082253e-07, |
|
"loss": 0.0196, |
|
"reward": 1.4152240753173828, |
|
"reward_std": 0.35989922285079956, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.5478771775960922, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1474.3367004394531, |
|
"epoch": 0.7699287103046014, |
|
"grad_norm": 0.20358206304391516, |
|
"kl": 0.0144500732421875, |
|
"learning_rate": 2.3616506292894282e-07, |
|
"loss": 0.0271, |
|
"reward": 1.4626062214374542, |
|
"reward_std": 0.29278943687677383, |
|
"rewards/accuracy_reward": 0.8775510042905807, |
|
"rewards/improved_len_reward_dast": 0.5850552245974541, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1752.2295227050781, |
|
"epoch": 0.7725210628645496, |
|
"grad_norm": 0.1833066106969091, |
|
"kl": 0.015289306640625, |
|
"learning_rate": 2.332498015913344e-07, |
|
"loss": 0.0009, |
|
"reward": 1.3457911014556885, |
|
"reward_std": 0.2773626856505871, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.5345666632056236, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1325.688720703125, |
|
"epoch": 0.7751134154244977, |
|
"grad_norm": 0.19517765602950424, |
|
"kl": 0.01210784912109375, |
|
"learning_rate": 2.303606535357843e-07, |
|
"loss": 0.0599, |
|
"reward": 1.5037426948547363, |
|
"reward_std": 0.26091703958809376, |
|
"rewards/accuracy_reward": 0.8775510191917419, |
|
"rewards/improved_len_reward_dast": 0.6261917278170586, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1663.0662689208984, |
|
"epoch": 0.7777057679844459, |
|
"grad_norm": 0.20601240191104908, |
|
"kl": 0.01605224609375, |
|
"learning_rate": 2.2749785694726685e-07, |
|
"loss": 0.0094, |
|
"reward": 1.3560754358768463, |
|
"reward_std": 0.37762896716594696, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.5346468687057495, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1426.6173095703125, |
|
"epoch": 0.7802981205443941, |
|
"grad_norm": 0.20108821286385423, |
|
"kl": 0.0143585205078125, |
|
"learning_rate": 2.2466164783830972e-07, |
|
"loss": 0.0207, |
|
"reward": 1.3399082869291306, |
|
"reward_std": 0.3976980447769165, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.5337858349084854, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1790.8978881835938, |
|
"epoch": 0.7828904731043422, |
|
"grad_norm": 0.21383459811515595, |
|
"kl": 0.0155029296875, |
|
"learning_rate": 2.2185226002953483e-07, |
|
"loss": 0.0004, |
|
"reward": 1.2710506618022919, |
|
"reward_std": 0.3618534617125988, |
|
"rewards/accuracy_reward": 0.785714253783226, |
|
"rewards/improved_len_reward_dast": 0.4853363707661629, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 1939.8775024414062, |
|
"epoch": 0.7854828256642904, |
|
"grad_norm": 0.29379980912133363, |
|
"kl": 0.01885986328125, |
|
"learning_rate": 2.1906992513038268e-07, |
|
"loss": 0.0479, |
|
"reward": 1.2805213034152985, |
|
"reward_std": 0.4143086224794388, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.4692968502640724, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1614.3775329589844, |
|
"epoch": 0.7880751782242384, |
|
"grad_norm": 0.17729210448855, |
|
"kl": 0.0162353515625, |
|
"learning_rate": 2.1631487252001822e-07, |
|
"loss": 0.0049, |
|
"reward": 1.234568029642105, |
|
"reward_std": 0.417904369533062, |
|
"rewards/accuracy_reward": 0.8010203838348389, |
|
"rewards/improved_len_reward_dast": 0.43354763835668564, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 2287.780548095703, |
|
"epoch": 0.7906675307841866, |
|
"grad_norm": 1.2242934021255432, |
|
"kl": 0.021087646484375, |
|
"learning_rate": 2.1358732932842032e-07, |
|
"loss": 0.0211, |
|
"reward": 1.0315402448177338, |
|
"reward_std": 0.36217188835144043, |
|
"rewards/accuracy_reward": 0.6581632494926453, |
|
"rewards/improved_len_reward_dast": 0.3733769580721855, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 1723.3673400878906, |
|
"epoch": 0.7932598833441348, |
|
"grad_norm": 0.20686736211065535, |
|
"kl": 0.015533447265625, |
|
"learning_rate": 2.1088752041765734e-07, |
|
"loss": 0.0319, |
|
"reward": 1.3500191867351532, |
|
"reward_std": 0.3599831163883209, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.5438967421650887, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1528.9183654785156, |
|
"epoch": 0.7958522359040829, |
|
"grad_norm": 0.21573348295043995, |
|
"kl": 0.015960693359375, |
|
"learning_rate": 2.0821566836334847e-07, |
|
"loss": -0.0098, |
|
"reward": 1.3639625310897827, |
|
"reward_std": 0.3467046692967415, |
|
"rewards/accuracy_reward": 0.8469387590885162, |
|
"rewards/improved_len_reward_dast": 0.5170237571001053, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1429.280532836914, |
|
"epoch": 0.7984445884640311, |
|
"grad_norm": 0.18304725042811948, |
|
"kl": 0.01262664794921875, |
|
"learning_rate": 2.0557199343631494e-07, |
|
"loss": 0.0087, |
|
"reward": 1.2729185968637466, |
|
"reward_std": 0.37279824167490005, |
|
"rewards/accuracy_reward": 0.8061224520206451, |
|
"rewards/improved_len_reward_dast": 0.4667961820960045, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 1876.0458679199219, |
|
"epoch": 0.8010369410239793, |
|
"grad_norm": 0.20278131778947003, |
|
"kl": 0.01853179931640625, |
|
"learning_rate": 2.0295671358442033e-07, |
|
"loss": 0.019, |
|
"reward": 1.3648760467767715, |
|
"reward_std": 0.3640540838241577, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.5536516159772873, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1463.239730834961, |
|
"epoch": 0.8036292935839274, |
|
"grad_norm": 0.22793846718497435, |
|
"kl": 0.014312744140625, |
|
"learning_rate": 2.0037004441460263e-07, |
|
"loss": 0.0287, |
|
"reward": 1.3905141055583954, |
|
"reward_std": 0.41797252371907234, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.5486774370074272, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1581.4999542236328, |
|
"epoch": 0.8062216461438756, |
|
"grad_norm": 0.2080094216762287, |
|
"kl": 0.01576995849609375, |
|
"learning_rate": 1.9781219917509987e-07, |
|
"loss": 0.0138, |
|
"reward": 1.4025911092758179, |
|
"reward_std": 0.3261520601809025, |
|
"rewards/accuracy_reward": 0.8265306055545807, |
|
"rewards/improved_len_reward_dast": 0.5760605186223984, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1737.1019897460938, |
|
"epoch": 0.8088139987038238, |
|
"grad_norm": 0.22193491426249878, |
|
"kl": 0.0164794921875, |
|
"learning_rate": 1.9528338873786882e-07, |
|
"loss": 0.0217, |
|
"reward": 1.1316132843494415, |
|
"reward_std": 0.44266829639673233, |
|
"rewards/accuracy_reward": 0.7397959157824516, |
|
"rewards/improved_len_reward_dast": 0.39181735552847385, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1681.6224060058594, |
|
"epoch": 0.8114063512637719, |
|
"grad_norm": 0.21692033379747663, |
|
"kl": 0.0162506103515625, |
|
"learning_rate": 1.9278382158120116e-07, |
|
"loss": 0.0256, |
|
"reward": 1.2757752537727356, |
|
"reward_std": 0.447167094796896, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.4798569083213806, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1513.8316040039062, |
|
"epoch": 0.81399870382372, |
|
"grad_norm": 0.18130741669805844, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 1.9031370377253574e-07, |
|
"loss": 0.0246, |
|
"reward": 1.535945862531662, |
|
"reward_std": 0.31188252195715904, |
|
"rewards/accuracy_reward": 0.8826530426740646, |
|
"rewards/improved_len_reward_dast": 0.653292790055275, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 1734.6632385253906, |
|
"epoch": 0.8165910563836681, |
|
"grad_norm": 0.18939277983218827, |
|
"kl": 0.0179443359375, |
|
"learning_rate": 1.8787323895147052e-07, |
|
"loss": -0.001, |
|
"reward": 1.1586688458919525, |
|
"reward_std": 0.4217538684606552, |
|
"rewards/accuracy_reward": 0.7551020234823227, |
|
"rewards/improved_len_reward_dast": 0.4035668522119522, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1650.4846496582031, |
|
"epoch": 0.8191834089436163, |
|
"grad_norm": 0.2171448495391751, |
|
"kl": 0.0167999267578125, |
|
"learning_rate": 1.8546262831297438e-07, |
|
"loss": -0.0121, |
|
"reward": 1.464043915271759, |
|
"reward_std": 0.3952450007200241, |
|
"rewards/accuracy_reward": 0.8724489510059357, |
|
"rewards/improved_len_reward_dast": 0.5915949791669846, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 1495.3316040039062, |
|
"epoch": 0.8217757615035645, |
|
"grad_norm": 0.19836205451789388, |
|
"kl": 0.0137481689453125, |
|
"learning_rate": 1.8308207059079938e-07, |
|
"loss": -0.0069, |
|
"reward": 1.1547789573669434, |
|
"reward_std": 0.41507500410079956, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.3843708522617817, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 1517.8367004394531, |
|
"epoch": 0.8243681140635126, |
|
"grad_norm": 0.20600261332668526, |
|
"kl": 0.0160064697265625, |
|
"learning_rate": 1.8073176204109837e-07, |
|
"loss": 0.0437, |
|
"reward": 1.438821941614151, |
|
"reward_std": 0.306551206856966, |
|
"rewards/accuracy_reward": 0.8775510042905807, |
|
"rewards/improved_len_reward_dast": 0.5612709149718285, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1504.4285278320312, |
|
"epoch": 0.8269604666234608, |
|
"grad_norm": 0.21261278084781152, |
|
"kl": 0.014495849609375, |
|
"learning_rate": 1.7841189642624428e-07, |
|
"loss": 0.0231, |
|
"reward": 1.229389488697052, |
|
"reward_std": 0.4350128807127476, |
|
"rewards/accuracy_reward": 0.7959183603525162, |
|
"rewards/improved_len_reward_dast": 0.4334711404517293, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 1672.8316040039062, |
|
"epoch": 0.829552819183409, |
|
"grad_norm": 0.1943882700904058, |
|
"kl": 0.0173492431640625, |
|
"learning_rate": 1.7612266499885642e-07, |
|
"loss": 0.0464, |
|
"reward": 1.5176236629486084, |
|
"reward_std": 0.3366955704987049, |
|
"rewards/accuracy_reward": 0.8877550810575485, |
|
"rewards/improved_len_reward_dast": 0.6298686116933823, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1179.0713653564453, |
|
"epoch": 0.8321451717433571, |
|
"grad_norm": 0.22615060777330476, |
|
"kl": 0.012054443359375, |
|
"learning_rate": 1.7386425648603354e-07, |
|
"loss": 0.0423, |
|
"reward": 1.5581437051296234, |
|
"reward_std": 0.234028534963727, |
|
"rewards/accuracy_reward": 0.8979591578245163, |
|
"rewards/improved_len_reward_dast": 0.6601845473051071, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1385.7346649169922, |
|
"epoch": 0.8347375243033053, |
|
"grad_norm": 0.18647668905538498, |
|
"kl": 0.0132293701171875, |
|
"learning_rate": 1.716368570737946e-07, |
|
"loss": -0.0176, |
|
"reward": 1.5387031435966492, |
|
"reward_std": 0.39274929463863373, |
|
"rewards/accuracy_reward": 0.9081632643938065, |
|
"rewards/improved_len_reward_dast": 0.6305398866534233, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1955.0357055664062, |
|
"epoch": 0.8373298768632534, |
|
"grad_norm": 0.1871384863519405, |
|
"kl": 0.01862335205078125, |
|
"learning_rate": 1.6944065039173004e-07, |
|
"loss": 0.0282, |
|
"reward": 0.9992491155862808, |
|
"reward_std": 0.4749828167259693, |
|
"rewards/accuracy_reward": 0.6785714030265808, |
|
"rewards/improved_len_reward_dast": 0.3206777200102806, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 1949.9693298339844, |
|
"epoch": 0.8399222294232016, |
|
"grad_norm": 0.20078422959231634, |
|
"kl": 0.020111083984375, |
|
"learning_rate": 1.672758174978622e-07, |
|
"loss": 0.0315, |
|
"reward": 1.227005422115326, |
|
"reward_std": 0.36194342374801636, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.46680130809545517, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 1403.64794921875, |
|
"epoch": 0.8425145819831497, |
|
"grad_norm": 0.20565437549884577, |
|
"kl": 0.0128936767578125, |
|
"learning_rate": 1.6514253686371917e-07, |
|
"loss": 0.0204, |
|
"reward": 1.4708826392889023, |
|
"reward_std": 0.2500988617539406, |
|
"rewards/accuracy_reward": 0.8826530426740646, |
|
"rewards/improved_len_reward_dast": 0.5882296115159988, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1667.8264770507812, |
|
"epoch": 0.8451069345430978, |
|
"grad_norm": 0.21813136540877595, |
|
"kl": 0.0157318115234375, |
|
"learning_rate": 1.630409843596216e-07, |
|
"loss": 0.0307, |
|
"reward": 1.3411798775196075, |
|
"reward_std": 0.32134104520082474, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.53505739569664, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1616.2908020019531, |
|
"epoch": 0.847699287103046, |
|
"grad_norm": 0.1969183257495155, |
|
"kl": 0.0156402587890625, |
|
"learning_rate": 1.609713332401831e-07, |
|
"loss": 0.0085, |
|
"reward": 1.2519380450248718, |
|
"reward_std": 0.458795890212059, |
|
"rewards/accuracy_reward": 0.7806122452020645, |
|
"rewards/improved_len_reward_dast": 0.4713258519768715, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1625.6377258300781, |
|
"epoch": 0.8502916396629941, |
|
"grad_norm": 0.24417535965250406, |
|
"kl": 0.0139617919921875, |
|
"learning_rate": 1.5893375413002765e-07, |
|
"loss": -0.0317, |
|
"reward": 1.2513196468353271, |
|
"reward_std": 0.47703811526298523, |
|
"rewards/accuracy_reward": 0.7704081386327744, |
|
"rewards/improved_len_reward_dast": 0.4809115380048752, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 2058.948944091797, |
|
"epoch": 0.8528839922229423, |
|
"grad_norm": 0.19451912015501954, |
|
"kl": 0.0210418701171875, |
|
"learning_rate": 1.569284150097226e-07, |
|
"loss": 0.0377, |
|
"reward": 1.2445521801710129, |
|
"reward_std": 0.26459160074591637, |
|
"rewards/accuracy_reward": 0.7295918315649033, |
|
"rewards/improved_len_reward_dast": 0.5149602852761745, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 1789.7040405273438, |
|
"epoch": 0.8554763447828905, |
|
"grad_norm": 0.24266903278771249, |
|
"kl": 0.019378662109375, |
|
"learning_rate": 1.5495548120193003e-07, |
|
"loss": 0.0434, |
|
"reward": 1.322462946176529, |
|
"reward_std": 0.38080430775880814, |
|
"rewards/accuracy_reward": 0.8265305906534195, |
|
"rewards/improved_len_reward_dast": 0.49593234062194824, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1468.8213653564453, |
|
"epoch": 0.8580686973428386, |
|
"grad_norm": 0.1945755306885796, |
|
"kl": 0.01294708251953125, |
|
"learning_rate": 1.5301511535777784e-07, |
|
"loss": 0.0302, |
|
"reward": 1.5070666372776031, |
|
"reward_std": 0.3562978059053421, |
|
"rewards/accuracy_reward": 0.8724489510059357, |
|
"rewards/improved_len_reward_dast": 0.6346177160739899, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1581.3825988769531, |
|
"epoch": 0.8606610499027868, |
|
"grad_norm": 0.29272858693831433, |
|
"kl": 0.01812744140625, |
|
"learning_rate": 1.5110747744345006e-07, |
|
"loss": 0.0122, |
|
"reward": 1.3418152332305908, |
|
"reward_std": 0.4640466570854187, |
|
"rewards/accuracy_reward": 0.8724489659070969, |
|
"rewards/improved_len_reward_dast": 0.46936625242233276, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1786.1734313964844, |
|
"epoch": 0.863253402462735, |
|
"grad_norm": 0.19480551857525122, |
|
"kl": 0.019775390625, |
|
"learning_rate": 1.4923272472699986e-07, |
|
"loss": -0.0042, |
|
"reward": 1.1590133309364319, |
|
"reward_std": 0.2618263028562069, |
|
"rewards/accuracy_reward": 0.7193877398967743, |
|
"rewards/improved_len_reward_dast": 0.4396255351603031, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1171.147933959961, |
|
"epoch": 0.8658457550226831, |
|
"grad_norm": 0.23814232802014945, |
|
"kl": 0.013671875, |
|
"learning_rate": 1.4739101176538274e-07, |
|
"loss": 0.0174, |
|
"reward": 1.2705652117729187, |
|
"reward_std": 0.3895917683839798, |
|
"rewards/accuracy_reward": 0.8367346823215485, |
|
"rewards/improved_len_reward_dast": 0.43383053690195084, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1758.0816040039062, |
|
"epoch": 0.8684381075826313, |
|
"grad_norm": 0.22764969968005389, |
|
"kl": 0.0219268798828125, |
|
"learning_rate": 1.4558249039171639e-07, |
|
"loss": 0.0414, |
|
"reward": 1.358829528093338, |
|
"reward_std": 0.38345643877983093, |
|
"rewards/accuracy_reward": 0.8367346823215485, |
|
"rewards/improved_len_reward_dast": 0.5220948457717896, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 1889.0509948730469, |
|
"epoch": 0.8710304601425793, |
|
"grad_norm": 0.22895792507657853, |
|
"kl": 0.021484375, |
|
"learning_rate": 1.4380730970276195e-07, |
|
"loss": 0.0354, |
|
"reward": 1.07760888338089, |
|
"reward_std": 0.3665538318455219, |
|
"rewards/accuracy_reward": 0.6887754797935486, |
|
"rewards/improved_len_reward_dast": 0.3888333588838577, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 2373.249969482422, |
|
"epoch": 0.8736228127025275, |
|
"grad_norm": 0.2697468121522664, |
|
"kl": 0.026397705078125, |
|
"learning_rate": 1.420656160466333e-07, |
|
"loss": -0.0102, |
|
"reward": 1.0278730392456055, |
|
"reward_std": 0.348503515124321, |
|
"rewards/accuracy_reward": 0.6938775330781937, |
|
"rewards/improved_len_reward_dast": 0.33399548195302486, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 1981.8978881835938, |
|
"epoch": 0.8762151652624757, |
|
"grad_norm": 0.20587316419649823, |
|
"kl": 0.0223846435546875, |
|
"learning_rate": 1.4035755301073102e-07, |
|
"loss": 0.0273, |
|
"reward": 1.2939772605895996, |
|
"reward_std": 0.46924955397844315, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.5286711901426315, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 1536.4336395263672, |
|
"epoch": 0.8788075178224238, |
|
"grad_norm": 0.20611627730954438, |
|
"kl": 0.0202789306640625, |
|
"learning_rate": 1.386832614099056e-07, |
|
"loss": 0.006, |
|
"reward": 1.4531451165676117, |
|
"reward_std": 0.3475269414484501, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.5960022807121277, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 1489.7652435302734, |
|
"epoch": 0.881399870382372, |
|
"grad_norm": 0.2223037836334228, |
|
"kl": 0.0159454345703125, |
|
"learning_rate": 1.3704287927484846e-07, |
|
"loss": -0.0138, |
|
"reward": 1.3403507471084595, |
|
"reward_std": 0.46086446195840836, |
|
"rewards/accuracy_reward": 0.8112244606018066, |
|
"rewards/improved_len_reward_dast": 0.529126301407814, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 1788.7091674804688, |
|
"epoch": 0.8839922229423202, |
|
"grad_norm": 0.188880858513302, |
|
"kl": 0.0198516845703125, |
|
"learning_rate": 1.3543654184071186e-07, |
|
"loss": 0.0144, |
|
"reward": 1.320367306470871, |
|
"reward_std": 0.2726456895470619, |
|
"rewards/accuracy_reward": 0.7755101919174194, |
|
"rewards/improved_len_reward_dast": 0.5448571220040321, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 1541.3316192626953, |
|
"epoch": 0.8865845755022683, |
|
"grad_norm": 0.20649364949795315, |
|
"kl": 0.01570892333984375, |
|
"learning_rate": 1.3386438153596067e-07, |
|
"loss": 0.0104, |
|
"reward": 1.327652782201767, |
|
"reward_std": 0.3968999646604061, |
|
"rewards/accuracy_reward": 0.846938744187355, |
|
"rewards/improved_len_reward_dast": 0.4807140678167343, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 1504.8775329589844, |
|
"epoch": 0.8891769280622165, |
|
"grad_norm": 0.23748978746970162, |
|
"kl": 0.0181427001953125, |
|
"learning_rate": 1.323265279714543e-07, |
|
"loss": -0.0172, |
|
"reward": 1.3229451477527618, |
|
"reward_std": 0.38034195080399513, |
|
"rewards/accuracy_reward": 0.8265306055545807, |
|
"rewards/improved_len_reward_dast": 0.49641457200050354, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 1616.14794921875, |
|
"epoch": 0.8917692806221647, |
|
"grad_norm": 0.228900632017236, |
|
"kl": 0.020263671875, |
|
"learning_rate": 1.3082310792976202e-07, |
|
"loss": 0.0331, |
|
"reward": 1.4383951127529144, |
|
"reward_std": 0.32518207281827927, |
|
"rewards/accuracy_reward": 0.8520407974720001, |
|
"rewards/improved_len_reward_dast": 0.5863542854785919, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 1765.0509948730469, |
|
"epoch": 0.8943616331821128, |
|
"grad_norm": 0.21689615981919957, |
|
"kl": 0.0205841064453125, |
|
"learning_rate": 1.293542453547102e-07, |
|
"loss": 0.0219, |
|
"reward": 1.3277872800827026, |
|
"reward_std": 0.4930282086133957, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.5114607587456703, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 1576.6071166992188, |
|
"epoch": 0.8969539857420609, |
|
"grad_norm": 0.2503011086919002, |
|
"kl": 0.0197906494140625, |
|
"learning_rate": 1.279200613411642e-07, |
|
"loss": 0.044, |
|
"reward": 1.2905025482177734, |
|
"reward_std": 0.47432298958301544, |
|
"rewards/accuracy_reward": 0.8214285522699356, |
|
"rewards/improved_len_reward_dast": 0.46907395869493484, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 2153.3162231445312, |
|
"epoch": 0.899546338302009, |
|
"grad_norm": 0.23273243697852358, |
|
"kl": 0.023712158203125, |
|
"learning_rate": 1.2652067412504605e-07, |
|
"loss": 0.0312, |
|
"reward": 1.047543928027153, |
|
"reward_std": 0.3953222408890724, |
|
"rewards/accuracy_reward": 0.688775509595871, |
|
"rewards/improved_len_reward_dast": 0.35876838117837906, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 1542.3111877441406, |
|
"epoch": 0.9021386908619572, |
|
"grad_norm": 0.25879665856811085, |
|
"kl": 0.0159149169921875, |
|
"learning_rate": 1.251561990735859e-07, |
|
"loss": 0.0306, |
|
"reward": 1.4665509164333344, |
|
"reward_std": 0.34583452716469765, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.599203959107399, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 2166.5713806152344, |
|
"epoch": 0.9047310434219054, |
|
"grad_norm": 0.21742881103681694, |
|
"kl": 0.029144287109375, |
|
"learning_rate": 1.238267486758117e-07, |
|
"loss": 0.0221, |
|
"reward": 0.9765184819698334, |
|
"reward_std": 0.4072360023856163, |
|
"rewards/accuracy_reward": 0.6224489733576775, |
|
"rewards/improved_len_reward_dast": 0.3540695160627365, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 1897.44384765625, |
|
"epoch": 0.9073233959818535, |
|
"grad_norm": 0.20381019828760852, |
|
"kl": 0.022857666015625, |
|
"learning_rate": 1.2253243253327504e-07, |
|
"loss": 0.0392, |
|
"reward": 1.2360577583312988, |
|
"reward_std": 0.4647463858127594, |
|
"rewards/accuracy_reward": 0.7653061151504517, |
|
"rewards/improved_len_reward_dast": 0.470751591026783, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 1563.9234313964844, |
|
"epoch": 0.9099157485418017, |
|
"grad_norm": 0.2149667100915999, |
|
"kl": 0.01705169677734375, |
|
"learning_rate": 1.212733573510154e-07, |
|
"loss": 0.0251, |
|
"reward": 1.484131395816803, |
|
"reward_std": 0.3115840032696724, |
|
"rewards/accuracy_reward": 0.867346927523613, |
|
"rewards/improved_len_reward_dast": 0.6167844533920288, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 1613.438720703125, |
|
"epoch": 0.9125081011017498, |
|
"grad_norm": 0.2397808119710266, |
|
"kl": 0.01849365234375, |
|
"learning_rate": 1.20049626928764e-07, |
|
"loss": 0.0255, |
|
"reward": 1.374268501996994, |
|
"reward_std": 0.3617161624133587, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.5579419583082199, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 1810.8724060058594, |
|
"epoch": 0.915100453661698, |
|
"grad_norm": 0.1952032672447838, |
|
"kl": 0.0240478515625, |
|
"learning_rate": 1.1886134215238539e-07, |
|
"loss": 0.0013, |
|
"reward": 1.2345272898674011, |
|
"reward_std": 0.4293368086218834, |
|
"rewards/accuracy_reward": 0.7602040618658066, |
|
"rewards/improved_len_reward_dast": 0.47432321310043335, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 1323.6071319580078, |
|
"epoch": 0.9176928062216462, |
|
"grad_norm": 0.23544630425662993, |
|
"kl": 0.0150299072265625, |
|
"learning_rate": 1.1770860098556122e-07, |
|
"loss": -0.0126, |
|
"reward": 1.5638253688812256, |
|
"reward_std": 0.3317151963710785, |
|
"rewards/accuracy_reward": 0.9234693795442581, |
|
"rewards/improved_len_reward_dast": 0.6403559893369675, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 1648.1122436523438, |
|
"epoch": 0.9202851587815943, |
|
"grad_norm": 0.19373617697957926, |
|
"kl": 0.01983642578125, |
|
"learning_rate": 1.1659149846171314e-07, |
|
"loss": -0.0106, |
|
"reward": 1.409626692533493, |
|
"reward_std": 0.3634777031838894, |
|
"rewards/accuracy_reward": 0.8112244606018066, |
|
"rewards/improved_len_reward_dast": 0.5984021797776222, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 1640.484634399414, |
|
"epoch": 0.9228775113415425, |
|
"grad_norm": 0.2139648005259324, |
|
"kl": 0.02065277099609375, |
|
"learning_rate": 1.1551012667616889e-07, |
|
"loss": -0.0041, |
|
"reward": 1.3790205717086792, |
|
"reward_std": 0.3004123643040657, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.5780001431703568, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 1952.6427612304688, |
|
"epoch": 0.9254698639014906, |
|
"grad_norm": 0.20207361431898127, |
|
"kl": 0.027069091796875, |
|
"learning_rate": 1.1446457477856933e-07, |
|
"loss": 0.0274, |
|
"reward": 1.1954913437366486, |
|
"reward_std": 0.30133310705423355, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.450593464076519, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 1666.0816040039062, |
|
"epoch": 0.9280622164614387, |
|
"grad_norm": 0.2020263485504787, |
|
"kl": 0.0185546875, |
|
"learning_rate": 1.1345492896551908e-07, |
|
"loss": -0.0157, |
|
"reward": 1.4352505505084991, |
|
"reward_std": 0.4688113033771515, |
|
"rewards/accuracy_reward": 0.8928571343421936, |
|
"rewards/improved_len_reward_dast": 0.542393408715725, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 1809.0611877441406, |
|
"epoch": 0.9306545690213869, |
|
"grad_norm": 0.2096938589768357, |
|
"kl": 0.020904541015625, |
|
"learning_rate": 1.1248127247348025e-07, |
|
"loss": 0.0384, |
|
"reward": 1.3605789840221405, |
|
"reward_std": 0.35709768906235695, |
|
"rewards/accuracy_reward": 0.8163264989852905, |
|
"rewards/improved_len_reward_dast": 0.544252522289753, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 1797.744857788086, |
|
"epoch": 0.933246921581335, |
|
"grad_norm": 0.21622133027589538, |
|
"kl": 0.02146148681640625, |
|
"learning_rate": 1.1154368557191032e-07, |
|
"loss": 0.0154, |
|
"reward": 1.0935336202383041, |
|
"reward_std": 0.3505462594330311, |
|
"rewards/accuracy_reward": 0.6938775479793549, |
|
"rewards/improved_len_reward_dast": 0.3996560573577881, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 1433.0765075683594, |
|
"epoch": 0.9358392741412832, |
|
"grad_norm": 0.22187489868295793, |
|
"kl": 0.0160064697265625, |
|
"learning_rate": 1.1064224555664489e-07, |
|
"loss": -0.0178, |
|
"reward": 1.2581793367862701, |
|
"reward_std": 0.4055371508002281, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.4520568624138832, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 1678.2703857421875, |
|
"epoch": 0.9384316267012314, |
|
"grad_norm": 0.18769832722230134, |
|
"kl": 0.0196075439453125, |
|
"learning_rate": 1.0977702674352485e-07, |
|
"loss": 0.0061, |
|
"reward": 1.533081442117691, |
|
"reward_std": 0.24393456988036633, |
|
"rewards/accuracy_reward": 0.8673469126224518, |
|
"rewards/improved_len_reward_dast": 0.6657344847917557, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 1496.3112030029297, |
|
"epoch": 0.9410239792611795, |
|
"grad_norm": 0.2409591218430649, |
|
"kl": 0.01830291748046875, |
|
"learning_rate": 1.0894810046227007e-07, |
|
"loss": 0.0454, |
|
"reward": 1.3800479769706726, |
|
"reward_std": 0.3536526523530483, |
|
"rewards/accuracy_reward": 0.8316326439380646, |
|
"rewards/improved_len_reward_dast": 0.548415370285511, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 1296.9234313964844, |
|
"epoch": 0.9436163318211277, |
|
"grad_norm": 0.2065960957661233, |
|
"kl": 0.014404296875, |
|
"learning_rate": 1.0815553505059864e-07, |
|
"loss": 0.0346, |
|
"reward": 1.4174171388149261, |
|
"reward_std": 0.3700226917862892, |
|
"rewards/accuracy_reward": 0.8673469126224518, |
|
"rewards/improved_len_reward_dast": 0.5500702187418938, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 1770.8111572265625, |
|
"epoch": 0.9462086843810759, |
|
"grad_norm": 0.22025176867987864, |
|
"kl": 0.0205535888671875, |
|
"learning_rate": 1.0739939584859327e-07, |
|
"loss": 0.0372, |
|
"reward": 1.2784855961799622, |
|
"reward_std": 0.40080468729138374, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.4876692369580269, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 2252.9540405273438, |
|
"epoch": 0.948801036941024, |
|
"grad_norm": 0.25202994466231426, |
|
"kl": 0.028900146484375, |
|
"learning_rate": 1.066797451933144e-07, |
|
"loss": 0.0538, |
|
"reward": 1.052029862999916, |
|
"reward_std": 0.4297824278473854, |
|
"rewards/accuracy_reward": 0.6734693944454193, |
|
"rewards/improved_len_reward_dast": 0.37856047973036766, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 1675.0867309570312, |
|
"epoch": 0.9513933895009722, |
|
"grad_norm": 0.18981437618840255, |
|
"kl": 0.019775390625, |
|
"learning_rate": 1.0599664241366108e-07, |
|
"loss": 0.0215, |
|
"reward": 1.4016070365905762, |
|
"reward_std": 0.4491507261991501, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.5444641783833504, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 2051.3162536621094, |
|
"epoch": 0.9539857420609202, |
|
"grad_norm": 0.18988751309956323, |
|
"kl": 0.0218658447265625, |
|
"learning_rate": 1.0535014382547976e-07, |
|
"loss": -0.0024, |
|
"reward": 1.3321772515773773, |
|
"reward_std": 0.5532524138689041, |
|
"rewards/accuracy_reward": 0.8418367207050323, |
|
"rewards/improved_len_reward_dast": 0.4903404861688614, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 1725.3927917480469, |
|
"epoch": 0.9565780946208684, |
|
"grad_norm": 0.26332331622328803, |
|
"kl": 0.02056884765625, |
|
"learning_rate": 1.0474030272692176e-07, |
|
"loss": -0.0428, |
|
"reward": 1.1207705438137054, |
|
"reward_std": 0.582356795668602, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.33505629003047943, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 1730.3264465332031, |
|
"epoch": 0.9591704471808166, |
|
"grad_norm": 0.23147600575876767, |
|
"kl": 0.020355224609375, |
|
"learning_rate": 1.0416716939404906e-07, |
|
"loss": 0.0207, |
|
"reward": 1.4236516058444977, |
|
"reward_std": 0.4436470791697502, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.5665087997913361, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 2078.234649658203, |
|
"epoch": 0.9617627997407647, |
|
"grad_norm": 0.18318392619509644, |
|
"kl": 0.02490234375, |
|
"learning_rate": 1.0363079107668965e-07, |
|
"loss": 0.0174, |
|
"reward": 1.2476365268230438, |
|
"reward_std": 0.4425313174724579, |
|
"rewards/accuracy_reward": 0.7704081535339355, |
|
"rewards/improved_len_reward_dast": 0.4772283583879471, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 1901.7754821777344, |
|
"epoch": 0.9643551523007129, |
|
"grad_norm": 0.2045058157665467, |
|
"kl": 0.0230865478515625, |
|
"learning_rate": 1.03131211994542e-07, |
|
"loss": 0.0151, |
|
"reward": 1.1136702597141266, |
|
"reward_std": 0.4208161160349846, |
|
"rewards/accuracy_reward": 0.6989795863628387, |
|
"rewards/improved_len_reward_dast": 0.41469068080186844, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 1673.6377563476562, |
|
"epoch": 0.9669475048606611, |
|
"grad_norm": 0.1953573582384899, |
|
"kl": 0.0203399658203125, |
|
"learning_rate": 1.0266847333352986e-07, |
|
"loss": 0.0144, |
|
"reward": 1.2215417325496674, |
|
"reward_std": 0.3687748461961746, |
|
"rewards/accuracy_reward": 0.8061224222183228, |
|
"rewards/improved_len_reward_dast": 0.4154192693531513, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 1465.4744262695312, |
|
"epoch": 0.9695398574206092, |
|
"grad_norm": 0.2392315039852379, |
|
"kl": 0.020263671875, |
|
"learning_rate": 1.022426132424064e-07, |
|
"loss": 0.0264, |
|
"reward": 1.3526732623577118, |
|
"reward_std": 0.2864141073077917, |
|
"rewards/accuracy_reward": 0.8418367058038712, |
|
"rewards/improved_len_reward_dast": 0.5108365193009377, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 1698.5611877441406, |
|
"epoch": 0.9721322099805574, |
|
"grad_norm": 0.22243506530923526, |
|
"kl": 0.018157958984375, |
|
"learning_rate": 1.0185366682960968e-07, |
|
"loss": 0.0368, |
|
"reward": 1.2421083450317383, |
|
"reward_std": 0.3934044614434242, |
|
"rewards/accuracy_reward": 0.7908163070678711, |
|
"rewards/improved_len_reward_dast": 0.451292023062706, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 1694.5101623535156, |
|
"epoch": 0.9747245625405055, |
|
"grad_norm": 0.2049483563870167, |
|
"kl": 0.02301025390625, |
|
"learning_rate": 1.015016661603677e-07, |
|
"loss": 0.0109, |
|
"reward": 1.2675099819898605, |
|
"reward_std": 0.27898336201906204, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.4613875336945057, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 1818.9183349609375, |
|
"epoch": 0.9773169151004537, |
|
"grad_norm": 0.2917301156280802, |
|
"kl": 0.022247314453125, |
|
"learning_rate": 1.011866402540555e-07, |
|
"loss": 0.052, |
|
"reward": 1.2979410141706467, |
|
"reward_std": 0.4051199574023485, |
|
"rewards/accuracy_reward": 0.8010203987360001, |
|
"rewards/improved_len_reward_dast": 0.4969206303358078, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 1676.4030151367188, |
|
"epoch": 0.9799092676604018, |
|
"grad_norm": 0.19999847167358073, |
|
"kl": 0.0189666748046875, |
|
"learning_rate": 1.0090861508180229e-07, |
|
"loss": 0.0173, |
|
"reward": 1.307900682091713, |
|
"reward_std": 0.36051470041275024, |
|
"rewards/accuracy_reward": 0.806122437119484, |
|
"rewards/improved_len_reward_dast": 0.5017782524228096, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 1303.3468933105469, |
|
"epoch": 0.9825016202203499, |
|
"grad_norm": 0.23002851272315084, |
|
"kl": 0.016387939453125, |
|
"learning_rate": 1.006676135643506e-07, |
|
"loss": 0.0223, |
|
"reward": 1.5040651261806488, |
|
"reward_std": 0.28981203213334084, |
|
"rewards/accuracy_reward": 0.8877550810575485, |
|
"rewards/improved_len_reward_dast": 0.6163100153207779, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 1699.98974609375, |
|
"epoch": 0.9850939727802981, |
|
"grad_norm": 0.2773167363062717, |
|
"kl": 0.021759033203125, |
|
"learning_rate": 1.004636555701666e-07, |
|
"loss": -0.0024, |
|
"reward": 1.3300544768571854, |
|
"reward_std": 0.4332263544201851, |
|
"rewards/accuracy_reward": 0.857142835855484, |
|
"rewards/improved_len_reward_dast": 0.47291168570518494, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 2158.5560607910156, |
|
"epoch": 0.9876863253402463, |
|
"grad_norm": 0.19893298725270195, |
|
"kl": 0.027099609375, |
|
"learning_rate": 1.0029675791380211e-07, |
|
"loss": 0.0245, |
|
"reward": 1.366698831319809, |
|
"reward_std": 0.3425176590681076, |
|
"rewards/accuracy_reward": 0.8112244755029678, |
|
"rewards/improved_len_reward_dast": 0.5554743856191635, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 1771.0765075683594, |
|
"epoch": 0.9902786779001944, |
|
"grad_norm": 0.21454331685840108, |
|
"kl": 0.025909423828125, |
|
"learning_rate": 1.0016693435450846e-07, |
|
"loss": 0.0522, |
|
"reward": 1.1434401869773865, |
|
"reward_std": 0.518133670091629, |
|
"rewards/accuracy_reward": 0.7448979467153549, |
|
"rewards/improved_len_reward_dast": 0.39854224771261215, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 1916.8673095703125, |
|
"epoch": 0.9928710304601426, |
|
"grad_norm": 0.21868762838968606, |
|
"kl": 0.0216217041015625, |
|
"learning_rate": 1.00074195595102e-07, |
|
"loss": 0.0149, |
|
"reward": 1.2855271100997925, |
|
"reward_std": 0.4449741840362549, |
|
"rewards/accuracy_reward": 0.7857142686843872, |
|
"rewards/improved_len_reward_dast": 0.4998128265142441, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 1359.0254821777344, |
|
"epoch": 0.9954633830200907, |
|
"grad_norm": 0.22146763439588837, |
|
"kl": 0.01685333251953125, |
|
"learning_rate": 1.0001854928108199e-07, |
|
"loss": -0.0267, |
|
"reward": 1.3678375780582428, |
|
"reward_std": 0.3422878012061119, |
|
"rewards/accuracy_reward": 0.8214285671710968, |
|
"rewards/improved_len_reward_dast": 0.5464089959859848, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 1564.7193908691406, |
|
"epoch": 0.9980557355800389, |
|
"grad_norm": 0.29725903676415294, |
|
"kl": 0.019683837890625, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0597, |
|
"reward": 1.2890927195549011, |
|
"reward_std": 0.3781392499804497, |
|
"rewards/accuracy_reward": 0.795918345451355, |
|
"rewards/improved_len_reward_dast": 0.49317440390586853, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9980557355800389, |
|
"step": 385, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0015093988140246698, |
|
"train_runtime": 5817.5821, |
|
"train_samples_per_second": 1.856, |
|
"train_steps_per_second": 0.066 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 385, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 7, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|