diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5047 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9980557355800389, + "eval_steps": 500, + "global_step": 385, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 1848.5458984375, + "epoch": 0.002592352559948153, + "grad_norm": 0.15412024450495956, + "kl": 0.0, + "learning_rate": 2.564102564102564e-08, + "loss": 0.0246, + "reward": 1.4397025108337402, + "reward_std": 0.4701927825808525, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.5978657752275467, + "step": 1 + }, + { + "completion_length": 2130.4540100097656, + "epoch": 0.005184705119896306, + "grad_norm": 0.19408049978062328, + "kl": 0.0, + "learning_rate": 5.128205128205128e-08, + "loss": 0.0596, + "reward": 1.0504228472709656, + "reward_std": 0.31693385541439056, + "rewards/accuracy_reward": 0.6938775479793549, + "rewards/improved_len_reward_dast": 0.3565452881157398, + "step": 2 + }, + { + "completion_length": 2034.2958679199219, + "epoch": 0.007777057679844459, + "grad_norm": 0.1531077683543166, + "kl": 0.0001348257064819336, + "learning_rate": 7.692307692307692e-08, + "loss": -0.0129, + "reward": 1.0101122856140137, + "reward_std": 0.4455054961144924, + "rewards/accuracy_reward": 0.6581632494926453, + "rewards/improved_len_reward_dast": 0.3519490174949169, + "step": 3 + }, + { + "completion_length": 2119.744903564453, + "epoch": 0.010369410239792612, + "grad_norm": 0.1349622041652031, + "kl": 0.00012981891632080078, + "learning_rate": 1.0256410256410256e-07, + "loss": -0.0044, + "reward": 1.2723601460456848, + "reward_std": 0.4871401861310005, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.46623772382736206, + "step": 4 + }, + { + "completion_length": 1834.7652893066406, + "epoch": 0.012961762799740765, + "grad_norm": 0.16434839601505108, + "kl": 0.00012123584747314453, + "learning_rate": 1.2820512820512818e-07, + "loss": 0.0443, + "reward": 1.267708569765091, + "reward_std": 0.3166223168373108, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.5024024695158005, + "step": 5 + }, + { + "completion_length": 2152.540740966797, + "epoch": 0.015554115359688918, + "grad_norm": 0.15696438577129812, + "kl": 0.00012969970703125, + "learning_rate": 1.5384615384615385e-07, + "loss": -0.0129, + "reward": 1.0658827871084213, + "reward_std": 0.4334075152873993, + "rewards/accuracy_reward": 0.7142857164144516, + "rewards/improved_len_reward_dast": 0.35159702971577644, + "step": 6 + }, + { + "completion_length": 1747.4591674804688, + "epoch": 0.01814646791963707, + "grad_norm": 0.15893508336342455, + "kl": 0.00010186433792114258, + "learning_rate": 1.7948717948717948e-07, + "loss": 0.0429, + "reward": 1.1448375135660172, + "reward_std": 0.37509680539369583, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.3846333734691143, + "step": 7 + }, + { + "completion_length": 1834.0611572265625, + "epoch": 0.020738820479585224, + "grad_norm": 0.1573166657366275, + "kl": 0.00011396408081054688, + "learning_rate": 2.0512820512820512e-07, + "loss": 0.0036, + "reward": 1.272167608141899, + "reward_std": 0.3015933446586132, + "rewards/accuracy_reward": 0.8010203838348389, + "rewards/improved_len_reward_dast": 0.47114718705415726, + "step": 8 + }, + { + "completion_length": 2077.1122131347656, + "epoch": 0.023331173039533377, + "grad_norm": 0.15123878128380125, + "kl": 0.0001251697540283203, + "learning_rate": 2.3076923076923078e-07, + "loss": 0.0025, + "reward": 1.1346809566020966, + "reward_std": 0.44101474434137344, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.38978295773267746, + "step": 9 + }, + { + "completion_length": 2001.6989135742188, + "epoch": 0.02592352559948153, + "grad_norm": 0.15946517595083978, + "kl": 0.00013494491577148438, + "learning_rate": 2.5641025641025636e-07, + "loss": 0.0414, + "reward": 1.0840217173099518, + "reward_std": 0.37720372527837753, + "rewards/accuracy_reward": 0.7244897931814194, + "rewards/improved_len_reward_dast": 0.3595319651067257, + "step": 10 + }, + { + "completion_length": 2258.3468322753906, + "epoch": 0.028515878159429683, + "grad_norm": 0.16258661653616813, + "kl": 0.0001423358917236328, + "learning_rate": 2.8205128205128203e-07, + "loss": -0.0035, + "reward": 1.035923331975937, + "reward_std": 0.44437722116708755, + "rewards/accuracy_reward": 0.6989795863628387, + "rewards/improved_len_reward_dast": 0.33694368600845337, + "step": 11 + }, + { + "completion_length": 2071.6019897460938, + "epoch": 0.031108230719377836, + "grad_norm": 0.15520698307030686, + "kl": 0.0001367330551147461, + "learning_rate": 3.076923076923077e-07, + "loss": 0.0151, + "reward": 1.1415546834468842, + "reward_std": 0.37767674773931503, + "rewards/accuracy_reward": 0.7653061002492905, + "rewards/improved_len_reward_dast": 0.3762484937906265, + "step": 12 + }, + { + "completion_length": 1976.1530151367188, + "epoch": 0.033700583279325985, + "grad_norm": 0.17189810461087038, + "kl": 0.00012564659118652344, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0019, + "reward": 1.125291794538498, + "reward_std": 0.4003720059990883, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.36508774384856224, + "step": 13 + }, + { + "completion_length": 2114.5612182617188, + "epoch": 0.03629293583927414, + "grad_norm": 0.18307106761606742, + "kl": 0.00011533498764038086, + "learning_rate": 3.5897435897435896e-07, + "loss": 0.0248, + "reward": 1.0526445508003235, + "reward_std": 0.33728349953889847, + "rewards/accuracy_reward": 0.6530612111091614, + "rewards/improved_len_reward_dast": 0.3995833285152912, + "step": 14 + }, + { + "completion_length": 1440.3571166992188, + "epoch": 0.03888528839922229, + "grad_norm": 0.19219239961861387, + "kl": 7.677078247070312e-05, + "learning_rate": 3.8461538461538463e-07, + "loss": 0.0411, + "reward": 1.3660516738891602, + "reward_std": 0.2804589569568634, + "rewards/accuracy_reward": 0.9030611962080002, + "rewards/improved_len_reward_dast": 0.46299050748348236, + "step": 15 + }, + { + "completion_length": 1305.2295684814453, + "epoch": 0.04147764095917045, + "grad_norm": 0.18960595204343547, + "kl": 9.632110595703125e-05, + "learning_rate": 4.1025641025641024e-07, + "loss": 0.0021, + "reward": 1.418413519859314, + "reward_std": 0.44618362933397293, + "rewards/accuracy_reward": 0.9132652878761292, + "rewards/improved_len_reward_dast": 0.5051482394337654, + "step": 16 + }, + { + "completion_length": 1996.841796875, + "epoch": 0.0440699935191186, + "grad_norm": 0.16908596036858053, + "kl": 0.00011074542999267578, + "learning_rate": 4.358974358974359e-07, + "loss": 0.0341, + "reward": 1.1314191222190857, + "reward_std": 0.6118374243378639, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.37121502310037613, + "step": 17 + }, + { + "completion_length": 1431.4846801757812, + "epoch": 0.046662346079066754, + "grad_norm": 0.22735446925703126, + "kl": 8.571147918701172e-05, + "learning_rate": 4.6153846153846156e-07, + "loss": 0.0407, + "reward": 1.206202208995819, + "reward_std": 0.3719758912920952, + "rewards/accuracy_reward": 0.8469387739896774, + "rewards/improved_len_reward_dast": 0.3592635001987219, + "step": 18 + }, + { + "completion_length": 1709.688720703125, + "epoch": 0.0492546986390149, + "grad_norm": 0.18577878700500422, + "kl": 0.00010377168655395508, + "learning_rate": 4.871794871794871e-07, + "loss": 0.0417, + "reward": 1.1775241941213608, + "reward_std": 0.5288017690181732, + "rewards/accuracy_reward": 0.7806122303009033, + "rewards/improved_len_reward_dast": 0.39691203087568283, + "step": 19 + }, + { + "completion_length": 1838.2754821777344, + "epoch": 0.05184705119896306, + "grad_norm": 0.16046849749418657, + "kl": 0.00011777877807617188, + "learning_rate": 5.128205128205127e-07, + "loss": 0.0208, + "reward": 1.1064813733100891, + "reward_std": 0.5807419717311859, + "rewards/accuracy_reward": 0.7551020234823227, + "rewards/improved_len_reward_dast": 0.3513793312013149, + "step": 20 + }, + { + "completion_length": 2217.14794921875, + "epoch": 0.05443940375891121, + "grad_norm": 0.1963426577198746, + "kl": 0.00014448165893554688, + "learning_rate": 5.384615384615384e-07, + "loss": 0.0467, + "reward": 1.0558834075927734, + "reward_std": 0.558340422809124, + "rewards/accuracy_reward": 0.6887754797935486, + "rewards/improved_len_reward_dast": 0.36710788309574127, + "step": 21 + }, + { + "completion_length": 1927.3316040039062, + "epoch": 0.057031756318859365, + "grad_norm": 0.18525325793381328, + "kl": 9.930133819580078e-05, + "learning_rate": 5.641025641025641e-07, + "loss": 0.0242, + "reward": 1.1790167838335037, + "reward_std": 0.4690204933285713, + "rewards/accuracy_reward": 0.7857142835855484, + "rewards/improved_len_reward_dast": 0.39330248534679413, + "step": 22 + }, + { + "completion_length": 1841.6938171386719, + "epoch": 0.059624108878807515, + "grad_norm": 0.17253945143916685, + "kl": 0.00010156631469726562, + "learning_rate": 5.897435897435898e-07, + "loss": 0.0724, + "reward": 1.3324860334396362, + "reward_std": 0.28684910759329796, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.5314656794071198, + "step": 23 + }, + { + "completion_length": 1679.9642333984375, + "epoch": 0.06221646143875567, + "grad_norm": 0.20870673606371046, + "kl": 0.00012958049774169922, + "learning_rate": 6.153846153846154e-07, + "loss": 0.0467, + "reward": 1.1419631987810135, + "reward_std": 0.38880112022161484, + "rewards/accuracy_reward": 0.8010203838348389, + "rewards/improved_len_reward_dast": 0.3409428298473358, + "step": 24 + }, + { + "completion_length": 2278.8673095703125, + "epoch": 0.06480881399870382, + "grad_norm": 0.15316366458717245, + "kl": 0.00015485286712646484, + "learning_rate": 6.410256410256411e-07, + "loss": 0.0203, + "reward": 0.9916537553071976, + "reward_std": 0.43884778022766113, + "rewards/accuracy_reward": 0.6479591578245163, + "rewards/improved_len_reward_dast": 0.3436945825815201, + "step": 25 + }, + { + "completion_length": 1853.4744873046875, + "epoch": 0.06740116655865197, + "grad_norm": 0.1623211083206233, + "kl": 0.0001201629638671875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.054, + "reward": 1.1868394315242767, + "reward_std": 0.4521937184035778, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.42663537338376045, + "step": 26 + }, + { + "completion_length": 1726.6427917480469, + "epoch": 0.06999351911860013, + "grad_norm": 0.21873628771810408, + "kl": 0.0001125335693359375, + "learning_rate": 6.923076923076922e-07, + "loss": 0.086, + "reward": 1.2924230992794037, + "reward_std": 0.41079702973365784, + "rewards/accuracy_reward": 0.8418367356061935, + "rewards/improved_len_reward_dast": 0.45058638602495193, + "step": 27 + }, + { + "completion_length": 1667.6071166992188, + "epoch": 0.07258587167854828, + "grad_norm": 0.18905776966101132, + "kl": 0.00011527538299560547, + "learning_rate": 7.179487179487179e-07, + "loss": 0.045, + "reward": 1.2638164162635803, + "reward_std": 0.2763877250254154, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.4525919631123543, + "step": 28 + }, + { + "completion_length": 2032.4132080078125, + "epoch": 0.07517822423849643, + "grad_norm": 0.15326481666027458, + "kl": 0.00012993812561035156, + "learning_rate": 7.435897435897435e-07, + "loss": 0.0002, + "reward": 1.1888954937458038, + "reward_std": 0.41189244389533997, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.42869146168231964, + "step": 29 + }, + { + "completion_length": 1764.4999389648438, + "epoch": 0.07777057679844458, + "grad_norm": 0.13723640714210214, + "kl": 9.167194366455078e-05, + "learning_rate": 7.692307692307693e-07, + "loss": -0.0066, + "reward": 1.0674456059932709, + "reward_std": 0.4443123862147331, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.2970374431461096, + "step": 30 + }, + { + "completion_length": 2198.729522705078, + "epoch": 0.08036292935839275, + "grad_norm": 0.15079546325320037, + "kl": 0.0001614093780517578, + "learning_rate": 7.948717948717948e-07, + "loss": 0.013, + "reward": 1.3089748322963715, + "reward_std": 0.5274734199047089, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.48754626512527466, + "step": 31 + }, + { + "completion_length": 1879.6376647949219, + "epoch": 0.0829552819183409, + "grad_norm": 0.18155740478939822, + "kl": 0.0001251697540283203, + "learning_rate": 8.205128205128205e-07, + "loss": 0.0131, + "reward": 1.0791111141443253, + "reward_std": 0.46941038966178894, + "rewards/accuracy_reward": 0.7346938699483871, + "rewards/improved_len_reward_dast": 0.34441729076206684, + "step": 32 + }, + { + "completion_length": 1981.6274719238281, + "epoch": 0.08554763447828904, + "grad_norm": 0.1572483646834791, + "kl": 0.0001424551010131836, + "learning_rate": 8.461538461538461e-07, + "loss": 0.0476, + "reward": 1.3903695046901703, + "reward_std": 0.4975530132651329, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.5332267209887505, + "step": 33 + }, + { + "completion_length": 2061.9999389648438, + "epoch": 0.0881399870382372, + "grad_norm": 0.1901994694040778, + "kl": 0.0001537799835205078, + "learning_rate": 8.717948717948718e-07, + "loss": 0.0481, + "reward": 1.1052793562412262, + "reward_std": 0.4630768448114395, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.36038143932819366, + "step": 34 + }, + { + "completion_length": 2465.1224060058594, + "epoch": 0.09073233959818536, + "grad_norm": 0.15096654762075654, + "kl": 0.0001761913299560547, + "learning_rate": 8.974358974358974e-07, + "loss": 0.0009, + "reward": 0.7364223003387451, + "reward_std": 0.4229283332824707, + "rewards/accuracy_reward": 0.5357142835855484, + "rewards/improved_len_reward_dast": 0.20070804562419653, + "step": 35 + }, + { + "completion_length": 2199.688720703125, + "epoch": 0.09332469215813351, + "grad_norm": 0.1791438585472734, + "kl": 0.0001895427703857422, + "learning_rate": 9.230769230769231e-07, + "loss": 0.0399, + "reward": 1.2042141258716583, + "reward_std": 0.3516070544719696, + "rewards/accuracy_reward": 0.7755101770162582, + "rewards/improved_len_reward_dast": 0.4287039190530777, + "step": 36 + }, + { + "completion_length": 2019.6478881835938, + "epoch": 0.09591704471808166, + "grad_norm": 0.1921872688604767, + "kl": 0.00020241737365722656, + "learning_rate": 9.487179487179486e-07, + "loss": 0.0187, + "reward": 1.3608680367469788, + "reward_std": 0.4326165243983269, + "rewards/accuracy_reward": 0.8316326439380646, + "rewards/improved_len_reward_dast": 0.5292353481054306, + "step": 37 + }, + { + "completion_length": 1693.0, + "epoch": 0.0985093972780298, + "grad_norm": 0.19045468187511366, + "kl": 0.0001348257064819336, + "learning_rate": 9.743589743589742e-07, + "loss": 0.0464, + "reward": 1.3455627113580704, + "reward_std": 0.3586850240826607, + "rewards/accuracy_reward": 0.846938744187355, + "rewards/improved_len_reward_dast": 0.49862393736839294, + "step": 38 + }, + { + "completion_length": 2374.637725830078, + "epoch": 0.10110174983797797, + "grad_norm": 0.13494398794899917, + "kl": 0.0002028942108154297, + "learning_rate": 1e-06, + "loss": 0.0272, + "reward": 0.8414318859577179, + "reward_std": 0.48852086812257767, + "rewards/accuracy_reward": 0.6224489659070969, + "rewards/improved_len_reward_dast": 0.21898294147104025, + "step": 39 + }, + { + "completion_length": 2517.3162841796875, + "epoch": 0.10369410239792612, + "grad_norm": 0.16744933736297124, + "kl": 0.0002105236053466797, + "learning_rate": 9.99981450718918e-07, + "loss": 0.0616, + "reward": 0.9213714599609375, + "reward_std": 0.43374133110046387, + "rewards/accuracy_reward": 0.6275510042905807, + "rewards/improved_len_reward_dast": 0.2938204384408891, + "step": 40 + }, + { + "completion_length": 1807.0203857421875, + "epoch": 0.10628645495787427, + "grad_norm": 0.15669439739322064, + "kl": 0.0002703666687011719, + "learning_rate": 9.99925804404898e-07, + "loss": 0.0228, + "reward": 0.994490772485733, + "reward_std": 0.5202224850654602, + "rewards/accuracy_reward": 0.7193877547979355, + "rewards/improved_len_reward_dast": 0.27510301768779755, + "step": 41 + }, + { + "completion_length": 1907.0305786132812, + "epoch": 0.10887880751782242, + "grad_norm": 0.1507066292700219, + "kl": 0.00019288063049316406, + "learning_rate": 9.998330656454915e-07, + "loss": 0.0566, + "reward": 1.3084075152873993, + "reward_std": 0.3637009263038635, + "rewards/accuracy_reward": 0.8367346823215485, + "rewards/improved_len_reward_dast": 0.4716728553175926, + "step": 42 + }, + { + "completion_length": 1946.2958984375, + "epoch": 0.11147116007777058, + "grad_norm": 0.21826053334493506, + "kl": 0.0002913475036621094, + "learning_rate": 9.99703242086198e-07, + "loss": 0.0894, + "reward": 1.0715700536966324, + "reward_std": 0.4503963589668274, + "rewards/accuracy_reward": 0.7397958934307098, + "rewards/improved_len_reward_dast": 0.3317741868086159, + "step": 43 + }, + { + "completion_length": 1862.9591674804688, + "epoch": 0.11406351263771873, + "grad_norm": 0.18297677442826724, + "kl": 0.000263214111328125, + "learning_rate": 9.995363444298333e-07, + "loss": 0.037, + "reward": 1.2490134239196777, + "reward_std": 0.4328879788517952, + "rewards/accuracy_reward": 0.7653061076998711, + "rewards/improved_len_reward_dast": 0.4837072864174843, + "step": 44 + }, + { + "completion_length": 2316.530517578125, + "epoch": 0.11665586519766688, + "grad_norm": 0.15141936649503004, + "kl": 0.0003380775451660156, + "learning_rate": 9.993323864356492e-07, + "loss": 0.0182, + "reward": 0.7743872255086899, + "reward_std": 0.55930295586586, + "rewards/accuracy_reward": 0.5765305981040001, + "rewards/improved_len_reward_dast": 0.19785663951188326, + "step": 45 + }, + { + "completion_length": 2924.1683349609375, + "epoch": 0.11924821775761503, + "grad_norm": 0.12614913947783052, + "kl": 0.0002567768096923828, + "learning_rate": 9.990913849181977e-07, + "loss": 0.0096, + "reward": 0.8433035537600517, + "reward_std": 0.41744476184248924, + "rewards/accuracy_reward": 0.5561224333941936, + "rewards/improved_len_reward_dast": 0.28718107007443905, + "step": 46 + }, + { + "completion_length": 1805.5203552246094, + "epoch": 0.1218405703175632, + "grad_norm": 0.15881163011201838, + "kl": 0.0007009506225585938, + "learning_rate": 9.988133597459444e-07, + "loss": 0.0175, + "reward": 1.1679251790046692, + "reward_std": 0.4487800747156143, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.3720068037509918, + "step": 47 + }, + { + "completion_length": 1873.7499389648438, + "epoch": 0.12443292287751134, + "grad_norm": 0.1713187626068608, + "kl": 0.00028634071350097656, + "learning_rate": 9.984983338396323e-07, + "loss": 0.0488, + "reward": 1.2101139575242996, + "reward_std": 0.33226554840803146, + "rewards/accuracy_reward": 0.760204091668129, + "rewards/improved_len_reward_dast": 0.44990991055965424, + "step": 48 + }, + { + "completion_length": 1411.4234161376953, + "epoch": 0.1270252754374595, + "grad_norm": 0.18215178056260903, + "kl": 0.0005662441253662109, + "learning_rate": 9.981463331703903e-07, + "loss": 0.0348, + "reward": 1.4565084278583527, + "reward_std": 0.3240164965391159, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.5891614705324173, + "step": 49 + }, + { + "completion_length": 1923.6836547851562, + "epoch": 0.12961762799740764, + "grad_norm": 0.21182741369137464, + "kl": 0.00043964385986328125, + "learning_rate": 9.977573867575937e-07, + "loss": 0.0483, + "reward": 1.0672244429588318, + "reward_std": 0.42784378305077553, + "rewards/accuracy_reward": 0.7244897782802582, + "rewards/improved_len_reward_dast": 0.342734657227993, + "step": 50 + }, + { + "completion_length": 2293.10205078125, + "epoch": 0.1322099805573558, + "grad_norm": 0.17784622321620705, + "kl": 0.0005965232849121094, + "learning_rate": 9.9733152666647e-07, + "loss": 0.0011, + "reward": 1.119166985154152, + "reward_std": 0.4692757725715637, + "rewards/accuracy_reward": 0.6836734563112259, + "rewards/improved_len_reward_dast": 0.43549349159002304, + "step": 51 + }, + { + "completion_length": 2606.8468627929688, + "epoch": 0.13480233311730394, + "grad_norm": 0.16188767449887392, + "kl": 0.0004382133483886719, + "learning_rate": 9.968687880054579e-07, + "loss": 0.0355, + "reward": 1.0624671429395676, + "reward_std": 0.5272083953022957, + "rewards/accuracy_reward": 0.6530612111091614, + "rewards/improved_len_reward_dast": 0.4094058535993099, + "step": 52 + }, + { + "completion_length": 1741.494873046875, + "epoch": 0.1373946856772521, + "grad_norm": 0.18163262147540796, + "kl": 0.0007987022399902344, + "learning_rate": 9.963692089233104e-07, + "loss": 0.0189, + "reward": 1.1586879789829254, + "reward_std": 0.3523149788379669, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.3678716644644737, + "step": 53 + }, + { + "completion_length": 1731.5713806152344, + "epoch": 0.13998703823720027, + "grad_norm": 0.17545616003222686, + "kl": 0.000713348388671875, + "learning_rate": 9.958328306059508e-07, + "loss": 0.0163, + "reward": 1.087464839220047, + "reward_std": 0.37970298528671265, + "rewards/accuracy_reward": 0.7499999701976776, + "rewards/improved_len_reward_dast": 0.3374648429453373, + "step": 54 + }, + { + "completion_length": 1940.2244262695312, + "epoch": 0.1425793907971484, + "grad_norm": 0.20829916863603212, + "kl": 0.0008840560913085938, + "learning_rate": 9.952596972730782e-07, + "loss": 0.0418, + "reward": 1.136895164847374, + "reward_std": 0.21965472772717476, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.37158904783427715, + "step": 55 + }, + { + "completion_length": 2024.3825988769531, + "epoch": 0.14517174335709657, + "grad_norm": 0.16061899047482414, + "kl": 0.0006990432739257812, + "learning_rate": 9.946498561745201e-07, + "loss": 0.0061, + "reward": 1.3091870546340942, + "reward_std": 0.42107394337654114, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.50816660374403, + "step": 56 + }, + { + "completion_length": 1990.7856750488281, + "epoch": 0.14776409591704473, + "grad_norm": 0.17205784813401187, + "kl": 0.0008096694946289062, + "learning_rate": 9.94003357586339e-07, + "loss": 0.0362, + "reward": 1.3399446904659271, + "reward_std": 0.34059275686740875, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.5185160860419273, + "step": 57 + }, + { + "completion_length": 2279.331573486328, + "epoch": 0.15035644847699287, + "grad_norm": 0.1637215457597632, + "kl": 0.0006699562072753906, + "learning_rate": 9.933202548066855e-07, + "loss": 0.0424, + "reward": 1.0715169459581375, + "reward_std": 0.39220181107521057, + "rewards/accuracy_reward": 0.6887754946947098, + "rewards/improved_len_reward_dast": 0.38274142518639565, + "step": 58 + }, + { + "completion_length": 2313.2499084472656, + "epoch": 0.15294880103694103, + "grad_norm": 0.16376379786761341, + "kl": 0.00083160400390625, + "learning_rate": 9.926006041514068e-07, + "loss": 0.0178, + "reward": 1.142714947462082, + "reward_std": 0.3937602676451206, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.40291906148195267, + "step": 59 + }, + { + "completion_length": 2046.1631774902344, + "epoch": 0.15554115359688916, + "grad_norm": 0.23236942157628335, + "kl": 0.0009450912475585938, + "learning_rate": 9.918444649494012e-07, + "loss": 0.0662, + "reward": 1.245220124721527, + "reward_std": 0.2695602234452963, + "rewards/accuracy_reward": 0.7755101770162582, + "rewards/improved_len_reward_dast": 0.46970994770526886, + "step": 60 + }, + { + "completion_length": 2175.6224060058594, + "epoch": 0.15813350615683733, + "grad_norm": 0.15376927864805173, + "kl": 0.0009765625, + "learning_rate": 9.9105189953773e-07, + "loss": 0.0196, + "reward": 1.2470524311065674, + "reward_std": 0.45635347813367844, + "rewards/accuracy_reward": 0.7653061002492905, + "rewards/improved_len_reward_dast": 0.48174627125263214, + "step": 61 + }, + { + "completion_length": 2337.1581115722656, + "epoch": 0.1607258587167855, + "grad_norm": 0.15218316765828901, + "kl": 0.0008411407470703125, + "learning_rate": 9.90222973256475e-07, + "loss": 0.0249, + "reward": 1.37412428855896, + "reward_std": 0.39829079806804657, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.552695706486702, + "step": 62 + }, + { + "completion_length": 2680.4183349609375, + "epoch": 0.16331821127673363, + "grad_norm": 0.21218309711028285, + "kl": 0.0010118484497070312, + "learning_rate": 9.89357754443355e-07, + "loss": 0.0529, + "reward": 0.8223338723182678, + "reward_std": 0.4073232337832451, + "rewards/accuracy_reward": 0.5510203987360001, + "rewards/improved_len_reward_dast": 0.2713134288787842, + "step": 63 + }, + { + "completion_length": 2635.7550048828125, + "epoch": 0.1659105638366818, + "grad_norm": 0.1620590183136494, + "kl": 0.000949859619140625, + "learning_rate": 9.884563144280897e-07, + "loss": 0.0464, + "reward": 1.0863047987222672, + "reward_std": 0.4714929535984993, + "rewards/accuracy_reward": 0.678571417927742, + "rewards/improved_len_reward_dast": 0.40773337706923485, + "step": 64 + }, + { + "completion_length": 1972.2907104492188, + "epoch": 0.16850291639662995, + "grad_norm": 0.17935605548712222, + "kl": 0.001079559326171875, + "learning_rate": 9.875187275265198e-07, + "loss": 0.0255, + "reward": 1.2364896833896637, + "reward_std": 0.4289153516292572, + "rewards/accuracy_reward": 0.7959183603525162, + "rewards/improved_len_reward_dast": 0.44057128578424454, + "step": 65 + }, + { + "completion_length": 2525.2091064453125, + "epoch": 0.1710952689565781, + "grad_norm": 0.14682421707314297, + "kl": 0.0012102127075195312, + "learning_rate": 9.865450710344807e-07, + "loss": 0.0344, + "reward": 0.8753379732370377, + "reward_std": 0.3238606099039316, + "rewards/accuracy_reward": 0.5918367132544518, + "rewards/improved_len_reward_dast": 0.2835012301802635, + "step": 66 + }, + { + "completion_length": 2308.1478576660156, + "epoch": 0.17368762151652625, + "grad_norm": 0.17311806443951758, + "kl": 0.001552581787109375, + "learning_rate": 9.855354252214307e-07, + "loss": 0.0564, + "reward": 1.152388408780098, + "reward_std": 0.4479888826608658, + "rewards/accuracy_reward": 0.7653061002492905, + "rewards/improved_len_reward_dast": 0.3870823085308075, + "step": 67 + }, + { + "completion_length": 1699.9540405273438, + "epoch": 0.1762799740764744, + "grad_norm": 0.18795647394996712, + "kl": 0.0012683868408203125, + "learning_rate": 9.844898733238311e-07, + "loss": 0.0538, + "reward": 1.4352277517318726, + "reward_std": 0.30926575139164925, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.5678808689117432, + "step": 68 + }, + { + "completion_length": 1942.3876953125, + "epoch": 0.17887232663642255, + "grad_norm": 0.2210659776524768, + "kl": 0.0016345977783203125, + "learning_rate": 9.83408501538287e-07, + "loss": -0.0183, + "reward": 1.0560709834098816, + "reward_std": 0.44945112615823746, + "rewards/accuracy_reward": 0.7346938699483871, + "rewards/improved_len_reward_dast": 0.32137710228562355, + "step": 69 + }, + { + "completion_length": 1671.9642639160156, + "epoch": 0.18146467919637072, + "grad_norm": 0.19750773670302219, + "kl": 0.0015382766723632812, + "learning_rate": 9.822913990144387e-07, + "loss": 0.0167, + "reward": 1.1308622658252716, + "reward_std": 0.4337487518787384, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.2737194411456585, + "step": 70 + }, + { + "completion_length": 2116.3571166992188, + "epoch": 0.18405703175631885, + "grad_norm": 0.1778004806410334, + "kl": 0.00168609619140625, + "learning_rate": 9.811386578476146e-07, + "loss": 0.0029, + "reward": 1.2179836481809616, + "reward_std": 0.46442168205976486, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.457779623568058, + "step": 71 + }, + { + "completion_length": 1906.9795532226562, + "epoch": 0.18664938431626701, + "grad_norm": 0.1986625505084921, + "kl": 0.001316070556640625, + "learning_rate": 9.79950373071236e-07, + "loss": 0.0285, + "reward": 1.1908049881458282, + "reward_std": 0.3781607896089554, + "rewards/accuracy_reward": 0.7244897931814194, + "rewards/improved_len_reward_dast": 0.4663151800632477, + "step": 72 + }, + { + "completion_length": 1938.2652587890625, + "epoch": 0.18924173687621518, + "grad_norm": 0.178605084347928, + "kl": 0.001659393310546875, + "learning_rate": 9.787266426489845e-07, + "loss": 0.0145, + "reward": 1.233821153640747, + "reward_std": 0.40631671994924545, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.46341295540332794, + "step": 73 + }, + { + "completion_length": 2097.5152587890625, + "epoch": 0.1918340894361633, + "grad_norm": 0.21993776817198404, + "kl": 0.0017414093017578125, + "learning_rate": 9.77467567466725e-07, + "loss": 0.0586, + "reward": 1.0030385106801987, + "reward_std": 0.48096026852726936, + "rewards/accuracy_reward": 0.6989795863628387, + "rewards/improved_len_reward_dast": 0.30405890196561813, + "step": 74 + }, + { + "completion_length": 2267.7193298339844, + "epoch": 0.19442644199611148, + "grad_norm": 0.25966079935566605, + "kl": 0.002155303955078125, + "learning_rate": 9.761732513241882e-07, + "loss": 0.1164, + "reward": 1.1867494583129883, + "reward_std": 0.36580438911914825, + "rewards/accuracy_reward": 0.7346938699483871, + "rewards/improved_len_reward_dast": 0.45205555111169815, + "step": 75 + }, + { + "completion_length": 1932.4285278320312, + "epoch": 0.1970187945560596, + "grad_norm": 0.18810468542751257, + "kl": 0.0028076171875, + "learning_rate": 9.748438009264142e-07, + "loss": 0.0311, + "reward": 1.302773892879486, + "reward_std": 0.3699945732951164, + "rewards/accuracy_reward": 0.8265306055545807, + "rewards/improved_len_reward_dast": 0.4762432426214218, + "step": 76 + }, + { + "completion_length": 2192.2601928710938, + "epoch": 0.19961114711600778, + "grad_norm": 0.1818517530996337, + "kl": 0.002178192138671875, + "learning_rate": 9.734793258749538e-07, + "loss": 0.0556, + "reward": 1.2119455933570862, + "reward_std": 0.33562129363417625, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.4517414830625057, + "step": 77 + }, + { + "completion_length": 2217.4693298339844, + "epoch": 0.20220349967595594, + "grad_norm": 0.17001135134898285, + "kl": 0.002323150634765625, + "learning_rate": 9.720799386588358e-07, + "loss": 0.0214, + "reward": 1.0081346929073334, + "reward_std": 0.5323201268911362, + "rewards/accuracy_reward": 0.6938775479793549, + "rewards/improved_len_reward_dast": 0.3142571374773979, + "step": 78 + }, + { + "completion_length": 2039.5867309570312, + "epoch": 0.20479585223590407, + "grad_norm": 0.19848985839460778, + "kl": 0.002605438232421875, + "learning_rate": 9.706457546452898e-07, + "loss": 0.0507, + "reward": 1.1386294960975647, + "reward_std": 0.3946889452636242, + "rewards/accuracy_reward": 0.7448979541659355, + "rewards/improved_len_reward_dast": 0.3937314935028553, + "step": 79 + }, + { + "completion_length": 2590.5305786132812, + "epoch": 0.20738820479585224, + "grad_norm": 0.15129066062202914, + "kl": 0.002803802490234375, + "learning_rate": 9.691768920702379e-07, + "loss": -0.0267, + "reward": 0.8391379117965698, + "reward_std": 0.39438748359680176, + "rewards/accuracy_reward": 0.5765306055545807, + "rewards/improved_len_reward_dast": 0.26260728016495705, + "step": 80 + }, + { + "completion_length": 2176.096893310547, + "epoch": 0.2099805573558004, + "grad_norm": 0.18394525455650038, + "kl": 0.00240325927734375, + "learning_rate": 9.676734720285456e-07, + "loss": 0.0667, + "reward": 1.148956298828125, + "reward_std": 0.34060123562812805, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.4040583297610283, + "step": 81 + }, + { + "completion_length": 2104.994842529297, + "epoch": 0.21257290991574854, + "grad_norm": 0.1783774193001553, + "kl": 0.00263214111328125, + "learning_rate": 9.661356184640394e-07, + "loss": 0.0607, + "reward": 1.300699919462204, + "reward_std": 0.29261183738708496, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.5149856060743332, + "step": 82 + }, + { + "completion_length": 2017.9591674804688, + "epoch": 0.2151652624756967, + "grad_norm": 0.20548002392363018, + "kl": 0.003589630126953125, + "learning_rate": 9.64563458159288e-07, + "loss": 0.0372, + "reward": 1.2817473858594894, + "reward_std": 0.42862868309020996, + "rewards/accuracy_reward": 0.8265305906534195, + "rewards/improved_len_reward_dast": 0.45521679520606995, + "step": 83 + }, + { + "completion_length": 2365.132568359375, + "epoch": 0.21775761503564484, + "grad_norm": 0.2118006180262065, + "kl": 0.003673553466796875, + "learning_rate": 9.629571207251515e-07, + "loss": 0.0474, + "reward": 1.1858174800872803, + "reward_std": 0.42872869968414307, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.4256134256720543, + "step": 84 + }, + { + "completion_length": 2227.8111572265625, + "epoch": 0.220349967595593, + "grad_norm": 0.1730257242071835, + "kl": 0.0032958984375, + "learning_rate": 9.613167385900944e-07, + "loss": 0.0116, + "reward": 0.9865487962961197, + "reward_std": 0.30924591794610023, + "rewards/accuracy_reward": 0.6887754946947098, + "rewards/improved_len_reward_dast": 0.2977732727304101, + "step": 85 + }, + { + "completion_length": 2069.8213806152344, + "epoch": 0.22294232015554116, + "grad_norm": 0.1997054811852766, + "kl": 0.003353118896484375, + "learning_rate": 9.59642446989269e-07, + "loss": 0.0275, + "reward": 1.2090528905391693, + "reward_std": 0.4271962344646454, + "rewards/accuracy_reward": 0.7806122303009033, + "rewards/improved_len_reward_dast": 0.428440660238266, + "step": 86 + }, + { + "completion_length": 2234.255096435547, + "epoch": 0.2255346727154893, + "grad_norm": 0.1689278406473576, + "kl": 0.0041046142578125, + "learning_rate": 9.579343839533668e-07, + "loss": 0.0395, + "reward": 1.1342998147010803, + "reward_std": 0.3173440955579281, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.3945038840174675, + "step": 87 + }, + { + "completion_length": 2258.3009643554688, + "epoch": 0.22812702527543746, + "grad_norm": 0.19449538540190586, + "kl": 0.004421234130859375, + "learning_rate": 9.561926902972378e-07, + "loss": 0.0785, + "reward": 1.2548484802246094, + "reward_std": 0.3709937259554863, + "rewards/accuracy_reward": 0.7755101770162582, + "rewards/improved_len_reward_dast": 0.47933831810951233, + "step": 88 + }, + { + "completion_length": 1870.6989440917969, + "epoch": 0.23071937783538563, + "grad_norm": 0.1864398126735164, + "kl": 0.0042266845703125, + "learning_rate": 9.544175096082838e-07, + "loss": 0.0646, + "reward": 1.4300118386745453, + "reward_std": 0.4286029487848282, + "rewards/accuracy_reward": 0.8928571343421936, + "rewards/improved_len_reward_dast": 0.5371547788381577, + "step": 89 + }, + { + "completion_length": 2082.653045654297, + "epoch": 0.23331173039533376, + "grad_norm": 0.17766778571294792, + "kl": 0.00475311279296875, + "learning_rate": 9.526089882346172e-07, + "loss": 0.032, + "reward": 1.1855316758155823, + "reward_std": 0.36463288590312004, + "rewards/accuracy_reward": 0.7551020085811615, + "rewards/improved_len_reward_dast": 0.4304296597838402, + "step": 90 + }, + { + "completion_length": 2117.2244262695312, + "epoch": 0.23590408295528192, + "grad_norm": 0.19874233088672905, + "kl": 0.003894805908203125, + "learning_rate": 9.507672752730001e-07, + "loss": 0.052, + "reward": 1.0779342502355576, + "reward_std": 0.45030639320611954, + "rewards/accuracy_reward": 0.734693855047226, + "rewards/improved_len_reward_dast": 0.3432403616607189, + "step": 91 + }, + { + "completion_length": 2126.6173095703125, + "epoch": 0.23849643551523006, + "grad_norm": 0.20706633281686568, + "kl": 0.004180908203125, + "learning_rate": 9.4889252255655e-07, + "loss": 0.0681, + "reward": 1.1621150970458984, + "reward_std": 0.2173718847334385, + "rewards/accuracy_reward": 0.7295918315649033, + "rewards/improved_len_reward_dast": 0.43252328783273697, + "step": 92 + }, + { + "completion_length": 2107.4692993164062, + "epoch": 0.24108878807517822, + "grad_norm": 0.18999527082233988, + "kl": 0.00507354736328125, + "learning_rate": 9.469848846422223e-07, + "loss": 0.0305, + "reward": 0.9012731686234474, + "reward_std": 0.2958849798887968, + "rewards/accuracy_reward": 0.6326530501246452, + "rewards/improved_len_reward_dast": 0.2686200775206089, + "step": 93 + }, + { + "completion_length": 2329.5662841796875, + "epoch": 0.2436811406351264, + "grad_norm": 0.17793830796024995, + "kl": 0.004726409912109375, + "learning_rate": 9.450445187980699e-07, + "loss": 0.0053, + "reward": 1.0069625079631805, + "reward_std": 0.4442039094865322, + "rewards/accuracy_reward": 0.663265272974968, + "rewards/improved_len_reward_dast": 0.3436972051858902, + "step": 94 + }, + { + "completion_length": 2371.1223754882812, + "epoch": 0.24627349319507452, + "grad_norm": 0.16551461901403783, + "kl": 0.00560760498046875, + "learning_rate": 9.430715849902774e-07, + "loss": 0.0161, + "reward": 1.1833973824977875, + "reward_std": 0.3829594776034355, + "rewards/accuracy_reward": 0.7551020309329033, + "rewards/improved_len_reward_dast": 0.4282953441143036, + "step": 95 + }, + { + "completion_length": 1950.9897766113281, + "epoch": 0.24886584575502269, + "grad_norm": 0.22225719247681372, + "kl": 0.004608154296875, + "learning_rate": 9.410662458699723e-07, + "loss": 0.0456, + "reward": 1.138383835554123, + "reward_std": 0.32722293585538864, + "rewards/accuracy_reward": 0.7142857015132904, + "rewards/improved_len_reward_dast": 0.4240981712937355, + "step": 96 + }, + { + "completion_length": 1459.1683349609375, + "epoch": 0.25145819831497085, + "grad_norm": 0.20670520181853694, + "kl": 0.00476837158203125, + "learning_rate": 9.390286667598169e-07, + "loss": 0.0546, + "reward": 1.3123253285884857, + "reward_std": 0.31760613806545734, + "rewards/accuracy_reward": 0.846938744187355, + "rewards/improved_len_reward_dast": 0.4653865396976471, + "step": 97 + }, + { + "completion_length": 1836.9029846191406, + "epoch": 0.254050550874919, + "grad_norm": 0.20386220038181252, + "kl": 0.00446319580078125, + "learning_rate": 9.369590156403784e-07, + "loss": 0.0339, + "reward": 1.3093420267105103, + "reward_std": 0.42256173491477966, + "rewards/accuracy_reward": 0.8163265138864517, + "rewards/improved_len_reward_dast": 0.49301546812057495, + "step": 98 + }, + { + "completion_length": 1921.7550354003906, + "epoch": 0.2566429034348671, + "grad_norm": 0.22385072499443348, + "kl": 0.00586700439453125, + "learning_rate": 9.348574631362808e-07, + "loss": 0.0254, + "reward": 1.369395136833191, + "reward_std": 0.292521633207798, + "rewards/accuracy_reward": 0.8367346823215485, + "rewards/improved_len_reward_dast": 0.5326604098081589, + "step": 99 + }, + { + "completion_length": 1589.2550659179688, + "epoch": 0.2592352559948153, + "grad_norm": 0.23062182502361955, + "kl": 0.003963470458984375, + "learning_rate": 9.327241825021379e-07, + "loss": 0.0939, + "reward": 1.398920476436615, + "reward_std": 0.34097858518362045, + "rewards/accuracy_reward": 0.8979591578245163, + "rewards/improved_len_reward_dast": 0.5009612441062927, + "step": 100 + }, + { + "completion_length": 1968.3979187011719, + "epoch": 0.26182760855476345, + "grad_norm": 0.19172453408443837, + "kl": 0.0052337646484375, + "learning_rate": 9.3055934960827e-07, + "loss": 0.033, + "reward": 1.2349633574485779, + "reward_std": 0.4557712897658348, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.46455518156290054, + "step": 101 + }, + { + "completion_length": 2024.6580810546875, + "epoch": 0.2644199611147116, + "grad_norm": 0.18835419471758258, + "kl": 0.00595855712890625, + "learning_rate": 9.283631429262053e-07, + "loss": -0.0018, + "reward": 1.237942174077034, + "reward_std": 0.4386955201625824, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.4522278979420662, + "step": 102 + }, + { + "completion_length": 2042.0101623535156, + "epoch": 0.2670123136746598, + "grad_norm": 0.16797444756904736, + "kl": 0.00687408447265625, + "learning_rate": 9.261357435139665e-07, + "loss": 0.0127, + "reward": 1.147979348897934, + "reward_std": 0.39860222302377224, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.3877752497792244, + "step": 103 + }, + { + "completion_length": 1771.6785278320312, + "epoch": 0.2696046662346079, + "grad_norm": 0.19397130084636785, + "kl": 0.00556182861328125, + "learning_rate": 9.238773350011437e-07, + "loss": 0.0329, + "reward": 1.3575038313865662, + "reward_std": 0.28452699072659016, + "rewards/accuracy_reward": 0.8418367356061935, + "rewards/improved_len_reward_dast": 0.5156671032309532, + "step": 104 + }, + { + "completion_length": 1984.2295532226562, + "epoch": 0.27219701879455604, + "grad_norm": 0.20491481745891912, + "kl": 0.00533294677734375, + "learning_rate": 9.215881035737557e-07, + "loss": 0.0756, + "reward": 1.3917469382286072, + "reward_std": 0.3919885456562042, + "rewards/accuracy_reward": 0.8673469126224518, + "rewards/improved_len_reward_dast": 0.5244000777602196, + "step": 105 + }, + { + "completion_length": 2123.3570861816406, + "epoch": 0.2747893713545042, + "grad_norm": 0.19107859298960242, + "kl": 0.00609588623046875, + "learning_rate": 9.192682379589017e-07, + "loss": 0.0343, + "reward": 1.3419382572174072, + "reward_std": 0.550883948802948, + "rewards/accuracy_reward": 0.8163265287876129, + "rewards/improved_len_reward_dast": 0.5256116688251495, + "step": 106 + }, + { + "completion_length": 2321.183563232422, + "epoch": 0.27738172391445237, + "grad_norm": 0.17417279176148165, + "kl": 0.00618743896484375, + "learning_rate": 9.169179294092006e-07, + "loss": 0.037, + "reward": 1.2553168833255768, + "reward_std": 0.3132058009505272, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.49001070857048035, + "step": 107 + }, + { + "completion_length": 1755.6121826171875, + "epoch": 0.27997407647440054, + "grad_norm": 0.1910812285243796, + "kl": 0.0055389404296875, + "learning_rate": 9.145373716870257e-07, + "loss": 0.0074, + "reward": 1.1911440938711166, + "reward_std": 0.47732261940836906, + "rewards/accuracy_reward": 0.8265305906534195, + "rewards/improved_len_reward_dast": 0.36461350694298744, + "step": 108 + }, + { + "completion_length": 2498.53564453125, + "epoch": 0.2825664290343487, + "grad_norm": 0.1847398357059974, + "kl": 0.0076904296875, + "learning_rate": 9.121267610485294e-07, + "loss": 0.0136, + "reward": 1.0379046350717545, + "reward_std": 0.5191724747419357, + "rewards/accuracy_reward": 0.6734693795442581, + "rewards/improved_len_reward_dast": 0.36443524062633514, + "step": 109 + }, + { + "completion_length": 1881.5408020019531, + "epoch": 0.2851587815942968, + "grad_norm": 0.1895141382280174, + "kl": 0.0063629150390625, + "learning_rate": 9.096862962274642e-07, + "loss": -0.0114, + "reward": 1.2222436666488647, + "reward_std": 0.2921589985489845, + "rewards/accuracy_reward": 0.760204054415226, + "rewards/improved_len_reward_dast": 0.4620395749807358, + "step": 110 + }, + { + "completion_length": 2229.341827392578, + "epoch": 0.28775113415424497, + "grad_norm": 0.16533064618080134, + "kl": 0.00737762451171875, + "learning_rate": 9.072161784187988e-07, + "loss": 0.029, + "reward": 1.213012382388115, + "reward_std": 0.427090298384428, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.41709401085972786, + "step": 111 + }, + { + "completion_length": 1740.8673400878906, + "epoch": 0.29034348671419313, + "grad_norm": 0.17704874550004857, + "kl": 0.00606536865234375, + "learning_rate": 9.047166112621312e-07, + "loss": 0.0232, + "reward": 1.3144700229167938, + "reward_std": 0.3366679251194, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.4981435164809227, + "step": 112 + }, + { + "completion_length": 2048.397918701172, + "epoch": 0.2929358392741413, + "grad_norm": 0.19568646749424262, + "kl": 0.00690460205078125, + "learning_rate": 9.021878008249001e-07, + "loss": 0.0206, + "reward": 1.1744825094938278, + "reward_std": 0.479649193584919, + "rewards/accuracy_reward": 0.7806122303009033, + "rewards/improved_len_reward_dast": 0.3938702493906021, + "step": 113 + }, + { + "completion_length": 1883.0255126953125, + "epoch": 0.29552819183408946, + "grad_norm": 0.201863471118327, + "kl": 0.007293701171875, + "learning_rate": 8.996299555853973e-07, + "loss": 0.0263, + "reward": 1.3593637347221375, + "reward_std": 0.3963543549180031, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.5175270512700081, + "step": 114 + }, + { + "completion_length": 1779.4489135742188, + "epoch": 0.29812054439403757, + "grad_norm": 0.21073286141952957, + "kl": 0.00705718994140625, + "learning_rate": 8.970432864155798e-07, + "loss": 0.059, + "reward": 1.284899353981018, + "reward_std": 0.3950739651918411, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.49408305436372757, + "step": 115 + }, + { + "completion_length": 1918.2244873046875, + "epoch": 0.30071289695398573, + "grad_norm": 0.19227538961602422, + "kl": 0.00742340087890625, + "learning_rate": 8.944280065636851e-07, + "loss": 0.0454, + "reward": 1.2475728243589401, + "reward_std": 0.32171259075403214, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.4618585482239723, + "step": 116 + }, + { + "completion_length": 1858.4795532226562, + "epoch": 0.3033052495139339, + "grad_norm": 0.19238271005304078, + "kl": 0.00749969482421875, + "learning_rate": 8.917843316366515e-07, + "loss": 0.0387, + "reward": 1.364868402481079, + "reward_std": 0.2818027026951313, + "rewards/accuracy_reward": 0.8316326439380646, + "rewards/improved_len_reward_dast": 0.533235713839531, + "step": 117 + }, + { + "completion_length": 1993.6224060058594, + "epoch": 0.30589760207388206, + "grad_norm": 0.231864346111992, + "kl": 0.00769805908203125, + "learning_rate": 8.891124795823426e-07, + "loss": -0.0075, + "reward": 1.1190623342990875, + "reward_std": 0.2991497367620468, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.3282460141927004, + "step": 118 + }, + { + "completion_length": 1985.5509643554688, + "epoch": 0.3084899546338302, + "grad_norm": 0.17623896225871394, + "kl": 0.00771331787109375, + "learning_rate": 8.864126706715796e-07, + "loss": 0.0186, + "reward": 1.2160087823867798, + "reward_std": 0.35445018485188484, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.4711107425391674, + "step": 119 + }, + { + "completion_length": 2125.1376953125, + "epoch": 0.31108230719377833, + "grad_norm": 0.2263640313290784, + "kl": 0.0087432861328125, + "learning_rate": 8.83685127479982e-07, + "loss": 0.0941, + "reward": 1.281501442193985, + "reward_std": 0.38218285515904427, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.5110933035612106, + "step": 120 + }, + { + "completion_length": 1814.5611877441406, + "epoch": 0.3136746597537265, + "grad_norm": 0.19715675281839773, + "kl": 0.007568359375, + "learning_rate": 8.809300748696173e-07, + "loss": 0.0386, + "reward": 1.1133249253034592, + "reward_std": 0.3796735033392906, + "rewards/accuracy_reward": 0.7295918315649033, + "rewards/improved_len_reward_dast": 0.38373304158449173, + "step": 121 + }, + { + "completion_length": 2427.4489135742188, + "epoch": 0.31626701231367466, + "grad_norm": 0.16760355775672944, + "kl": 0.00905609130859375, + "learning_rate": 8.781477399704652e-07, + "loss": 0.0048, + "reward": 1.0130163729190826, + "reward_std": 0.4051677845418453, + "rewards/accuracy_reward": 0.6632652878761292, + "rewards/improved_len_reward_dast": 0.349751066416502, + "step": 122 + }, + { + "completion_length": 2251.3570861816406, + "epoch": 0.3188593648736228, + "grad_norm": 0.1882544168870131, + "kl": 0.00846099853515625, + "learning_rate": 8.753383521616902e-07, + "loss": 0.0008, + "reward": 1.1944599151611328, + "reward_std": 0.4080551564693451, + "rewards/accuracy_reward": 0.7499999850988388, + "rewards/improved_len_reward_dast": 0.4444599226117134, + "step": 123 + }, + { + "completion_length": 1852.142822265625, + "epoch": 0.321451717433571, + "grad_norm": 0.22567456549295617, + "kl": 0.007122039794921875, + "learning_rate": 8.72502143052733e-07, + "loss": 0.0421, + "reward": 1.0371171534061432, + "reward_std": 0.4070936441421509, + "rewards/accuracy_reward": 0.6887754946947098, + "rewards/improved_len_reward_dast": 0.34834159165620804, + "step": 124 + }, + { + "completion_length": 1902.4897766113281, + "epoch": 0.32404406999351915, + "grad_norm": 0.18976500768952323, + "kl": 0.00728607177734375, + "learning_rate": 8.696393464642158e-07, + "loss": -0.0168, + "reward": 1.379349261522293, + "reward_std": 0.34975893795490265, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.5324105769395828, + "step": 125 + }, + { + "completion_length": 1687.3979187011719, + "epoch": 0.32663642255346725, + "grad_norm": 0.1842833719422884, + "kl": 0.00609588623046875, + "learning_rate": 8.667501984086655e-07, + "loss": 0.0248, + "reward": 1.3401367366313934, + "reward_std": 0.26001402735710144, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.5544224381446838, + "step": 126 + }, + { + "completion_length": 1719.23974609375, + "epoch": 0.3292287751134154, + "grad_norm": 0.2122526031093734, + "kl": 0.00665283203125, + "learning_rate": 8.638349370710573e-07, + "loss": 0.0493, + "reward": 1.2587095499038696, + "reward_std": 0.30533889308571815, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.4423830099403858, + "step": 127 + }, + { + "completion_length": 1702.78564453125, + "epoch": 0.3318211276733636, + "grad_norm": 0.18811783070011717, + "kl": 0.00623321533203125, + "learning_rate": 8.608938027891775e-07, + "loss": 0.0049, + "reward": 1.3044427931308746, + "reward_std": 0.47574885934591293, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.49832039326429367, + "step": 128 + }, + { + "completion_length": 1589.6376953125, + "epoch": 0.33441348023331174, + "grad_norm": 0.2122723729405287, + "kl": 0.007274627685546875, + "learning_rate": 8.579270380338107e-07, + "loss": 0.0378, + "reward": 1.3573221862316132, + "reward_std": 0.40166376531124115, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.510383352637291, + "step": 129 + }, + { + "completion_length": 2209.2244873046875, + "epoch": 0.3370058327932599, + "grad_norm": 0.18766107651382932, + "kl": 0.0082550048828125, + "learning_rate": 8.549348873887496e-07, + "loss": -0.035, + "reward": 0.9989715814590454, + "reward_std": 0.4630734659731388, + "rewards/accuracy_reward": 0.6734693646430969, + "rewards/improved_len_reward_dast": 0.32550226897001266, + "step": 130 + }, + { + "completion_length": 1750.2499694824219, + "epoch": 0.339598185353208, + "grad_norm": 0.26668844455154506, + "kl": 0.0062713623046875, + "learning_rate": 8.519175975306312e-07, + "loss": 0.0733, + "reward": 1.0193718448281288, + "reward_std": 0.49021392315626144, + "rewards/accuracy_reward": 0.6989795863628387, + "rewards/improved_len_reward_dast": 0.3203922025859356, + "step": 131 + }, + { + "completion_length": 1834.892822265625, + "epoch": 0.3421905379131562, + "grad_norm": 0.17123158557193757, + "kl": 0.006275177001953125, + "learning_rate": 8.48875417208601e-07, + "loss": 0.0191, + "reward": 1.2724904865026474, + "reward_std": 0.36864253878593445, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.5020823329687119, + "step": 132 + }, + { + "completion_length": 1844.9081115722656, + "epoch": 0.34478289047310434, + "grad_norm": 0.1744110793812119, + "kl": 0.00693511962890625, + "learning_rate": 8.458085972238048e-07, + "loss": 0.0332, + "reward": 1.0728662610054016, + "reward_std": 0.4644254148006439, + "rewards/accuracy_reward": 0.7499999850988388, + "rewards/improved_len_reward_dast": 0.3228662498295307, + "step": 133 + }, + { + "completion_length": 1910.1427917480469, + "epoch": 0.3473752430330525, + "grad_norm": 0.22282630764089068, + "kl": 0.0084686279296875, + "learning_rate": 8.427173904087138e-07, + "loss": 0.0291, + "reward": 1.1172972619533539, + "reward_std": 0.3814988359808922, + "rewards/accuracy_reward": 0.7551020085811615, + "rewards/improved_len_reward_dast": 0.36219523288309574, + "step": 134 + }, + { + "completion_length": 2461.3775329589844, + "epoch": 0.34996759559300067, + "grad_norm": 0.1595488734110434, + "kl": 0.0104522705078125, + "learning_rate": 8.396020516062794e-07, + "loss": -0.0068, + "reward": 0.9715078249573708, + "reward_std": 0.3740999586880207, + "rewards/accuracy_reward": 0.6173469200730324, + "rewards/improved_len_reward_dast": 0.3541608899831772, + "step": 135 + }, + { + "completion_length": 1467.096908569336, + "epoch": 0.3525599481529488, + "grad_norm": 0.17905275908990426, + "kl": 0.005458831787109375, + "learning_rate": 8.364628376489242e-07, + "loss": 0.0333, + "reward": 1.558873325586319, + "reward_std": 0.29448162391781807, + "rewards/accuracy_reward": 0.928571417927742, + "rewards/improved_len_reward_dast": 0.6303019374608994, + "step": 136 + }, + { + "completion_length": 1310.5, + "epoch": 0.35515230071289694, + "grad_norm": 0.20951329036509847, + "kl": 0.0060577392578125, + "learning_rate": 8.333000073373685e-07, + "loss": -0.0166, + "reward": 1.2859368920326233, + "reward_std": 0.3338315784931183, + "rewards/accuracy_reward": 0.8061224520206451, + "rewards/improved_len_reward_dast": 0.47981445118784904, + "step": 137 + }, + { + "completion_length": 1815.6122436523438, + "epoch": 0.3577446532728451, + "grad_norm": 0.19604752185803775, + "kl": 0.0070953369140625, + "learning_rate": 8.301138214192945e-07, + "loss": 0.0433, + "reward": 1.2342120856046677, + "reward_std": 0.4501468688249588, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.4331916607916355, + "step": 138 + }, + { + "completion_length": 1862.0764770507812, + "epoch": 0.36033700583279327, + "grad_norm": 0.18709921475186367, + "kl": 0.0084228515625, + "learning_rate": 8.269045425678497e-07, + "loss": -0.011, + "reward": 1.2167351096868515, + "reward_std": 0.3770736940205097, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.45653103291988373, + "step": 139 + }, + { + "completion_length": 1736.1376953125, + "epoch": 0.36292935839274143, + "grad_norm": 0.19354018571685683, + "kl": 0.0071258544921875, + "learning_rate": 8.236724353599918e-07, + "loss": 0.041, + "reward": 1.496632605791092, + "reward_std": 0.3335278294980526, + "rewards/accuracy_reward": 0.8979591578245163, + "rewards/improved_len_reward_dast": 0.5986734926700592, + "step": 140 + }, + { + "completion_length": 1628.4183654785156, + "epoch": 0.36552171095268954, + "grad_norm": 0.16803171468726585, + "kl": 0.00705718994140625, + "learning_rate": 8.204177662546763e-07, + "loss": -0.0198, + "reward": 1.2802585661411285, + "reward_std": 0.3480174820870161, + "rewards/accuracy_reward": 0.8163265138864517, + "rewards/improved_len_reward_dast": 0.46393200755119324, + "step": 141 + }, + { + "completion_length": 1563.2244567871094, + "epoch": 0.3681140635126377, + "grad_norm": 0.21830948983629073, + "kl": 0.006256103515625, + "learning_rate": 8.171408035708906e-07, + "loss": 0.0147, + "reward": 1.477361023426056, + "reward_std": 0.36876992136240005, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.6151161342859268, + "step": 142 + }, + { + "completion_length": 1426.9744567871094, + "epoch": 0.37070641607258586, + "grad_norm": 0.1829469047156503, + "kl": 0.005870819091796875, + "learning_rate": 8.138418174655323e-07, + "loss": -0.0128, + "reward": 1.475436508655548, + "reward_std": 0.28024090081453323, + "rewards/accuracy_reward": 0.8877550959587097, + "rewards/improved_len_reward_dast": 0.5876814350485802, + "step": 143 + }, + { + "completion_length": 2269.73974609375, + "epoch": 0.37329876863253403, + "grad_norm": 0.15370768982629232, + "kl": 0.00823974609375, + "learning_rate": 8.105210799111366e-07, + "loss": 0.029, + "reward": 1.0333527326583862, + "reward_std": 0.4238397367298603, + "rewards/accuracy_reward": 0.6632652878761292, + "rewards/improved_len_reward_dast": 0.37008739449083805, + "step": 144 + }, + { + "completion_length": 1661.2142333984375, + "epoch": 0.3758911211924822, + "grad_norm": 0.1756144937263373, + "kl": 0.006439208984375, + "learning_rate": 8.071788646734564e-07, + "loss": 0.0278, + "reward": 1.297868698835373, + "reward_std": 0.30791742727160454, + "rewards/accuracy_reward": 0.8163265138864517, + "rewards/improved_len_reward_dast": 0.4815421551465988, + "step": 145 + }, + { + "completion_length": 1629.2754516601562, + "epoch": 0.37848347375243035, + "grad_norm": 0.19753853796416515, + "kl": 0.006805419921875, + "learning_rate": 8.038154472888909e-07, + "loss": -0.0047, + "reward": 1.2643596529960632, + "reward_std": 0.403556901961565, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.45823724940419197, + "step": 146 + }, + { + "completion_length": 1698.1785278320312, + "epoch": 0.38107582631237846, + "grad_norm": 0.18090958864036752, + "kl": 0.00759124755859375, + "learning_rate": 8.004311050417711e-07, + "loss": -0.0063, + "reward": 1.2380123734474182, + "reward_std": 0.39292842149734497, + "rewards/accuracy_reward": 0.7806122153997421, + "rewards/improved_len_reward_dast": 0.4574001543223858, + "step": 147 + }, + { + "completion_length": 1603.7703704833984, + "epoch": 0.3836681788723266, + "grad_norm": 0.1689548990240542, + "kl": 0.00655364990234375, + "learning_rate": 7.970261169414999e-07, + "loss": 0.0034, + "reward": 1.2632354497909546, + "reward_std": 0.42876998893916607, + "rewards/accuracy_reward": 0.8010203838348389, + "rewards/improved_len_reward_dast": 0.46221502870321274, + "step": 148 + }, + { + "completion_length": 2111.928497314453, + "epoch": 0.3862605314322748, + "grad_norm": 0.23403462014206552, + "kl": 0.00902557373046875, + "learning_rate": 7.936007636995497e-07, + "loss": 0.0581, + "reward": 1.1535758823156357, + "reward_std": 0.33541079610586166, + "rewards/accuracy_reward": 0.7091836556792259, + "rewards/improved_len_reward_dast": 0.44439224898815155, + "step": 149 + }, + { + "completion_length": 1584.5560760498047, + "epoch": 0.38885288399222295, + "grad_norm": 0.19966714442908384, + "kl": 0.00608062744140625, + "learning_rate": 7.901553277063213e-07, + "loss": -0.0136, + "reward": 1.0925945341587067, + "reward_std": 0.4660287909209728, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.3323905020952225, + "step": 150 + }, + { + "completion_length": 1963.030502319336, + "epoch": 0.3914452365521711, + "grad_norm": 0.17996728024183786, + "kl": 0.0086822509765625, + "learning_rate": 7.866900930078618e-07, + "loss": 0.0058, + "reward": 1.245696559548378, + "reward_std": 0.4446266293525696, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.4854924902319908, + "step": 151 + }, + { + "completion_length": 1893.0254821777344, + "epoch": 0.3940375891121192, + "grad_norm": 0.16735022993158205, + "kl": 0.007110595703125, + "learning_rate": 7.832053452824489e-07, + "loss": 0.0104, + "reward": 1.2418105602264404, + "reward_std": 0.4090575650334358, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.4714023545384407, + "step": 152 + }, + { + "completion_length": 1724.3111572265625, + "epoch": 0.3966299416720674, + "grad_norm": 0.1864010620729168, + "kl": 0.00872802734375, + "learning_rate": 7.797013718170384e-07, + "loss": 0.0296, + "reward": 1.1897482573986053, + "reward_std": 0.3867075741291046, + "rewards/accuracy_reward": 0.7755101919174194, + "rewards/improved_len_reward_dast": 0.4142380841076374, + "step": 153 + }, + { + "completion_length": 1520.3673553466797, + "epoch": 0.39922229423201555, + "grad_norm": 0.19558753420229233, + "kl": 0.006317138671875, + "learning_rate": 7.761784614835801e-07, + "loss": -0.0009, + "reward": 1.1826948821544647, + "reward_std": 0.44549785554409027, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.3969806134700775, + "step": 154 + }, + { + "completion_length": 1902.83154296875, + "epoch": 0.4018146467919637, + "grad_norm": 0.1628442801355898, + "kl": 0.007907867431640625, + "learning_rate": 7.726369047152029e-07, + "loss": 0.0111, + "reward": 1.1829434633255005, + "reward_std": 0.4352233223617077, + "rewards/accuracy_reward": 0.7346938699483871, + "rewards/improved_len_reward_dast": 0.44824954867362976, + "step": 155 + }, + { + "completion_length": 1687.5867004394531, + "epoch": 0.4044069993519119, + "grad_norm": 0.15254799874290897, + "kl": 0.0055694580078125, + "learning_rate": 7.690769934822712e-07, + "loss": 0.0209, + "reward": 1.3427188694477081, + "reward_std": 0.39824075251817703, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.5212903171777725, + "step": 156 + }, + { + "completion_length": 1699.2857055664062, + "epoch": 0.40699935191186, + "grad_norm": 0.17162045711276386, + "kl": 0.00756072998046875, + "learning_rate": 7.654990212683142e-07, + "loss": 0.0029, + "reward": 1.3672717213630676, + "reward_std": 0.34800875186920166, + "rewards/accuracy_reward": 0.8520407974720001, + "rewards/improved_len_reward_dast": 0.5152308940887451, + "step": 157 + }, + { + "completion_length": 1642.4897766113281, + "epoch": 0.40959170447180815, + "grad_norm": 0.17781118941038052, + "kl": 0.0069427490234375, + "learning_rate": 7.619032830458307e-07, + "loss": 0.0238, + "reward": 1.36138716340065, + "reward_std": 0.42799485474824905, + "rewards/accuracy_reward": 0.8520407974720001, + "rewards/improved_len_reward_dast": 0.5093463957309723, + "step": 158 + }, + { + "completion_length": 2058.10205078125, + "epoch": 0.4121840570317563, + "grad_norm": 0.21486100887413462, + "kl": 0.00844573974609375, + "learning_rate": 7.582900752519723e-07, + "loss": 0.052, + "reward": 1.2367046475410461, + "reward_std": 0.4686100408434868, + "rewards/accuracy_reward": 0.7857142835855484, + "rewards/improved_len_reward_dast": 0.45099035650491714, + "step": 159 + }, + { + "completion_length": 2116.7601928710938, + "epoch": 0.4147764095917045, + "grad_norm": 0.21872883985010524, + "kl": 0.00928497314453125, + "learning_rate": 7.546596957641031e-07, + "loss": 0.0469, + "reward": 1.1451009958982468, + "reward_std": 0.2814931422472, + "rewards/accuracy_reward": 0.7244897782802582, + "rewards/improved_len_reward_dast": 0.4206111915409565, + "step": 160 + }, + { + "completion_length": 2057.1172790527344, + "epoch": 0.41736876215165264, + "grad_norm": 0.223277485058984, + "kl": 0.0099639892578125, + "learning_rate": 7.510124438752432e-07, + "loss": 0.0282, + "reward": 1.2358856201171875, + "reward_std": 0.42381204664707184, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.4501713886857033, + "step": 161 + }, + { + "completion_length": 1648.7907409667969, + "epoch": 0.4199611147116008, + "grad_norm": 0.19361427922643096, + "kl": 0.007965087890625, + "learning_rate": 7.473486202693949e-07, + "loss": 0.0283, + "reward": 1.5626276433467865, + "reward_std": 0.33783891052007675, + "rewards/accuracy_reward": 0.9081632643938065, + "rewards/improved_len_reward_dast": 0.6544643938541412, + "step": 162 + }, + { + "completion_length": 1720.7805938720703, + "epoch": 0.4225534672715489, + "grad_norm": 0.22042630118078563, + "kl": 0.008636474609375, + "learning_rate": 7.43668526996753e-07, + "loss": 0.0517, + "reward": 1.203346148133278, + "reward_std": 0.48596539348363876, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.43293796479701996, + "step": 163 + }, + { + "completion_length": 1918.5816345214844, + "epoch": 0.4251458198314971, + "grad_norm": 0.20825217508460148, + "kl": 0.0105438232421875, + "learning_rate": 7.399724674488046e-07, + "loss": 0.0313, + "reward": 1.2619640827178955, + "reward_std": 0.3394176550209522, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.49665799736976624, + "step": 164 + }, + { + "completion_length": 1879.0867004394531, + "epoch": 0.42773817239144524, + "grad_norm": 0.20859456410748778, + "kl": 0.00949859619140625, + "learning_rate": 7.36260746333316e-07, + "loss": 0.1032, + "reward": 1.250516802072525, + "reward_std": 0.21495914831757545, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.48521073907613754, + "step": 165 + }, + { + "completion_length": 1788.2040405273438, + "epoch": 0.4303305249513934, + "grad_norm": 0.19365279193672524, + "kl": 0.00925445556640625, + "learning_rate": 7.325336696492128e-07, + "loss": 0.031, + "reward": 1.3934488892555237, + "reward_std": 0.3679058402776718, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.5261020287871361, + "step": 166 + }, + { + "completion_length": 2040.7346801757812, + "epoch": 0.43292287751134156, + "grad_norm": 0.1746728685861396, + "kl": 0.010894775390625, + "learning_rate": 7.287915446613531e-07, + "loss": 0.0021, + "reward": 1.270061433315277, + "reward_std": 0.3740099295973778, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.46393903344869614, + "step": 167 + }, + { + "completion_length": 2118.234649658203, + "epoch": 0.43551523007128967, + "grad_norm": 0.20129074148639173, + "kl": 0.013275146484375, + "learning_rate": 7.250346798751953e-07, + "loss": 0.006, + "reward": 0.9839373528957367, + "reward_std": 0.581517793238163, + "rewards/accuracy_reward": 0.6785714030265808, + "rewards/improved_len_reward_dast": 0.3053659498691559, + "step": 168 + }, + { + "completion_length": 1795.9540252685547, + "epoch": 0.43810758263123784, + "grad_norm": 0.1813953032982878, + "kl": 0.009395599365234375, + "learning_rate": 7.212633850113662e-07, + "loss": 0.0235, + "reward": 1.178409919142723, + "reward_std": 0.4242382049560547, + "rewards/accuracy_reward": 0.734693855047226, + "rewards/improved_len_reward_dast": 0.44371599704027176, + "step": 169 + }, + { + "completion_length": 1421.1734619140625, + "epoch": 0.440699935191186, + "grad_norm": 0.18794137958282095, + "kl": 0.008941650390625, + "learning_rate": 7.174779709801253e-07, + "loss": 0.0159, + "reward": 1.4234746396541595, + "reward_std": 0.32885606586933136, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.5612297654151917, + "step": 170 + }, + { + "completion_length": 1736.6632690429688, + "epoch": 0.44329228775113416, + "grad_norm": 0.22796049151575712, + "kl": 0.009891510009765625, + "learning_rate": 7.136787498557344e-07, + "loss": 0.0088, + "reward": 1.3514071702957153, + "reward_std": 0.40995020419359207, + "rewards/accuracy_reward": 0.846938744187355, + "rewards/improved_len_reward_dast": 0.5044683739542961, + "step": 171 + }, + { + "completion_length": 1768.7193603515625, + "epoch": 0.4458846403110823, + "grad_norm": 0.25032479837006205, + "kl": 0.010284423828125, + "learning_rate": 7.098660348507293e-07, + "loss": 0.0732, + "reward": 1.269765853881836, + "reward_std": 0.46360351890325546, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.4993576854467392, + "step": 172 + }, + { + "completion_length": 1956.9999694824219, + "epoch": 0.44847699287103043, + "grad_norm": 0.17507117871432235, + "kl": 0.0093231201171875, + "learning_rate": 7.060401402900977e-07, + "loss": 0.0185, + "reward": 1.1613440364599228, + "reward_std": 0.5052430480718613, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.42154809460043907, + "step": 173 + }, + { + "completion_length": 1834.2601623535156, + "epoch": 0.4510693454309786, + "grad_norm": 0.19217203672529928, + "kl": 0.01007843017578125, + "learning_rate": 7.022013815853672e-07, + "loss": 0.0209, + "reward": 1.0959883034229279, + "reward_std": 0.47629018872976303, + "rewards/accuracy_reward": 0.7295918166637421, + "rewards/improved_len_reward_dast": 0.3663964793086052, + "step": 174 + }, + { + "completion_length": 1817.4489440917969, + "epoch": 0.45366169799092676, + "grad_norm": 0.19322905501288215, + "kl": 0.01153564453125, + "learning_rate": 6.983500752086006e-07, + "loss": 0.0448, + "reward": 1.2833284437656403, + "reward_std": 0.43457718193531036, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.4874100536108017, + "step": 175 + }, + { + "completion_length": 1651.7244873046875, + "epoch": 0.4562540505508749, + "grad_norm": 0.19443121591302054, + "kl": 0.00969696044921875, + "learning_rate": 6.94486538666307e-07, + "loss": 0.0327, + "reward": 1.254166454076767, + "reward_std": 0.4054510071873665, + "rewards/accuracy_reward": 0.7806122452020645, + "rewards/improved_len_reward_dast": 0.47355421632528305, + "step": 176 + }, + { + "completion_length": 1690.4234313964844, + "epoch": 0.4588464031108231, + "grad_norm": 0.2099852909442493, + "kl": 0.0092010498046875, + "learning_rate": 6.906110904732656e-07, + "loss": -0.0115, + "reward": 1.3241359293460846, + "reward_std": 0.4749620705842972, + "rewards/accuracy_reward": 0.8163265138864517, + "rewards/improved_len_reward_dast": 0.5078093633055687, + "step": 177 + }, + { + "completion_length": 2150.1529541015625, + "epoch": 0.46143875567077125, + "grad_norm": 0.16262254100217993, + "kl": 0.01073455810546875, + "learning_rate": 6.867240501262666e-07, + "loss": 0.0219, + "reward": 1.3224327564239502, + "reward_std": 0.31201132386922836, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.5622286796569824, + "step": 178 + }, + { + "completion_length": 1616.73974609375, + "epoch": 0.46403110823071936, + "grad_norm": 0.2054857790671321, + "kl": 0.010406494140625, + "learning_rate": 6.828257380777723e-07, + "loss": -0.0028, + "reward": 1.2023987025022507, + "reward_std": 0.38464218378067017, + "rewards/accuracy_reward": 0.8214285671710968, + "rewards/improved_len_reward_dast": 0.38097016140818596, + "step": 179 + }, + { + "completion_length": 1939.9744567871094, + "epoch": 0.4666234607906675, + "grad_norm": 0.18969129476831767, + "kl": 0.0137481689453125, + "learning_rate": 6.789164757094978e-07, + "loss": 0.035, + "reward": 1.1967380195856094, + "reward_std": 0.3427240923047066, + "rewards/accuracy_reward": 0.734693855047226, + "rewards/improved_len_reward_dast": 0.4620441570878029, + "step": 180 + }, + { + "completion_length": 1848.25, + "epoch": 0.4692158133506157, + "grad_norm": 0.18668896975291646, + "kl": 0.011810302734375, + "learning_rate": 6.749965853059164e-07, + "loss": 0.0536, + "reward": 1.3282198309898376, + "reward_std": 0.4290488064289093, + "rewards/accuracy_reward": 0.8520407974720001, + "rewards/improved_len_reward_dast": 0.47617900371551514, + "step": 181 + }, + { + "completion_length": 1659.9489440917969, + "epoch": 0.47180816591056385, + "grad_norm": 0.2068391235436955, + "kl": 0.0099334716796875, + "learning_rate": 6.710663900276903e-07, + "loss": 0.0149, + "reward": 1.1044558137655258, + "reward_std": 0.389005184173584, + "rewards/accuracy_reward": 0.7244897931814194, + "rewards/improved_len_reward_dast": 0.37996600940823555, + "step": 182 + }, + { + "completion_length": 1548.0152893066406, + "epoch": 0.474400518470512, + "grad_norm": 0.19942963085334378, + "kl": 0.00998687744140625, + "learning_rate": 6.671262138850274e-07, + "loss": 0.0277, + "reward": 1.4036801755428314, + "reward_std": 0.325181283056736, + "rewards/accuracy_reward": 0.846938744187355, + "rewards/improved_len_reward_dast": 0.5567413941025734, + "step": 183 + }, + { + "completion_length": 1479.9234619140625, + "epoch": 0.4769928710304601, + "grad_norm": 0.17528837750916904, + "kl": 0.00907135009765625, + "learning_rate": 6.631763817109717e-07, + "loss": 0.0212, + "reward": 1.4963186979293823, + "reward_std": 0.2380654364824295, + "rewards/accuracy_reward": 0.8826530426740646, + "rewards/improved_len_reward_dast": 0.6136656627058983, + "step": 184 + }, + { + "completion_length": 1625.2856750488281, + "epoch": 0.4795852235904083, + "grad_norm": 0.2340295745334256, + "kl": 0.00994873046875, + "learning_rate": 6.592172191346218e-07, + "loss": 0.0387, + "reward": 1.3299905359745026, + "reward_std": 0.4121420457959175, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.5085620209574699, + "step": 185 + }, + { + "completion_length": 1799.586669921875, + "epoch": 0.48217757615035645, + "grad_norm": 0.208310701570096, + "kl": 0.012359619140625, + "learning_rate": 6.552490525542864e-07, + "loss": 0.0341, + "reward": 1.2161507308483124, + "reward_std": 0.3565462492406368, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.4559466913342476, + "step": 186 + }, + { + "completion_length": 1612.836685180664, + "epoch": 0.4847699287103046, + "grad_norm": 0.1767048426760215, + "kl": 0.0106048583984375, + "learning_rate": 6.512722091105757e-07, + "loss": -0.0013, + "reward": 1.3248589038848877, + "reward_std": 0.45474397391080856, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.5136343911290169, + "step": 187 + }, + { + "completion_length": 1306.5509796142578, + "epoch": 0.4873622812702528, + "grad_norm": 0.212241902185087, + "kl": 0.00981903076171875, + "learning_rate": 6.472870166594314e-07, + "loss": 0.0047, + "reward": 1.4141908586025238, + "reward_std": 0.4169772267341614, + "rewards/accuracy_reward": 0.8418367058038712, + "rewards/improved_len_reward_dast": 0.5723541006445885, + "step": 188 + }, + { + "completion_length": 1914.642822265625, + "epoch": 0.4899546338302009, + "grad_norm": 0.2520686184939368, + "kl": 0.0127410888671875, + "learning_rate": 6.432938037450974e-07, + "loss": -0.0237, + "reward": 1.1971821933984756, + "reward_std": 0.3514118604362011, + "rewards/accuracy_reward": 0.7499999850988388, + "rewards/improved_len_reward_dast": 0.44718217849731445, + "step": 189 + }, + { + "completion_length": 1808.9183349609375, + "epoch": 0.49254698639014904, + "grad_norm": 0.2130749709969565, + "kl": 0.01201629638671875, + "learning_rate": 6.392928995730352e-07, + "loss": 0.0412, + "reward": 1.2710473388433456, + "reward_std": 0.3865230418741703, + "rewards/accuracy_reward": 0.7908163219690323, + "rewards/improved_len_reward_dast": 0.48023101314902306, + "step": 190 + }, + { + "completion_length": 1365.4795837402344, + "epoch": 0.4951393389500972, + "grad_norm": 0.250237755024117, + "kl": 0.00952911376953125, + "learning_rate": 6.352846339827826e-07, + "loss": 0.095, + "reward": 1.5109961926937103, + "reward_std": 0.30784352123737335, + "rewards/accuracy_reward": 0.9132653027772903, + "rewards/improved_len_reward_dast": 0.5977308824658394, + "step": 191 + }, + { + "completion_length": 1425.2755126953125, + "epoch": 0.49773169151004537, + "grad_norm": 0.22368363257945995, + "kl": 0.0114288330078125, + "learning_rate": 6.312693374207627e-07, + "loss": 0.0195, + "reward": 1.2838004529476166, + "reward_std": 0.46850764751434326, + "rewards/accuracy_reward": 0.8265306055545807, + "rewards/improved_len_reward_dast": 0.4572698399424553, + "step": 192 + }, + { + "completion_length": 1588.5101623535156, + "epoch": 0.5003240440699935, + "grad_norm": 0.20204139731047027, + "kl": 0.01300048828125, + "learning_rate": 6.272473409130397e-07, + "loss": 0.0012, + "reward": 1.3159003108739853, + "reward_std": 0.4093224108219147, + "rewards/accuracy_reward": 0.8316326439380646, + "rewards/improved_len_reward_dast": 0.484267670661211, + "step": 193 + }, + { + "completion_length": 1411.3571166992188, + "epoch": 0.5029163966299417, + "grad_norm": 0.19443397701968118, + "kl": 0.00821685791015625, + "learning_rate": 6.232189760380301e-07, + "loss": 0.0224, + "reward": 1.288124531507492, + "reward_std": 0.3209230378270149, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.5024102553725243, + "step": 194 + }, + { + "completion_length": 1751.6785278320312, + "epoch": 0.5055087491898899, + "grad_norm": 0.18304814418314927, + "kl": 0.0109100341796875, + "learning_rate": 6.191845748991671e-07, + "loss": -0.007, + "reward": 1.0736610293388367, + "reward_std": 0.32857421785593033, + "rewards/accuracy_reward": 0.6581632494926453, + "rewards/improved_len_reward_dast": 0.41549770161509514, + "step": 195 + }, + { + "completion_length": 1771.5968933105469, + "epoch": 0.508101101749838, + "grad_norm": 0.20612952277089522, + "kl": 0.0137939453125, + "learning_rate": 6.151444700975203e-07, + "loss": 0.0106, + "reward": 1.360820233821869, + "reward_std": 0.38221075385808945, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.518983505666256, + "step": 196 + }, + { + "completion_length": 2076.3060913085938, + "epoch": 0.5106934543097861, + "grad_norm": 0.22320859434163112, + "kl": 0.0132293701171875, + "learning_rate": 6.110989947043767e-07, + "loss": 0.0519, + "reward": 1.101119041442871, + "reward_std": 0.4651700109243393, + "rewards/accuracy_reward": 0.7244897931814194, + "rewards/improved_len_reward_dast": 0.37662921100854874, + "step": 197 + }, + { + "completion_length": 1513.6530151367188, + "epoch": 0.5132858068697342, + "grad_norm": 0.24160481879222073, + "kl": 0.0120849609375, + "learning_rate": 6.070484822337816e-07, + "loss": 0.0617, + "reward": 1.3807711601257324, + "reward_std": 0.30266276001930237, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.5185262858867645, + "step": 198 + }, + { + "completion_length": 1659.4744262695312, + "epoch": 0.5158781594296824, + "grad_norm": 0.2860111752617934, + "kl": 0.0122528076171875, + "learning_rate": 6.029932666150431e-07, + "loss": 0.0487, + "reward": 1.27889584004879, + "reward_std": 0.40974466502666473, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.4778754487633705, + "step": 199 + }, + { + "completion_length": 1553.6479187011719, + "epoch": 0.5184705119896306, + "grad_norm": 0.17284042761570728, + "kl": 0.0113372802734375, + "learning_rate": 5.989336821652029e-07, + "loss": -0.0157, + "reward": 1.292808324098587, + "reward_std": 0.3536081798374653, + "rewards/accuracy_reward": 0.7755101919174194, + "rewards/improved_len_reward_dast": 0.517298124730587, + "step": 200 + }, + { + "completion_length": 1221.6734313964844, + "epoch": 0.5210628645495787, + "grad_norm": 0.20576387898105802, + "kl": 0.00975799560546875, + "learning_rate": 5.948700635614745e-07, + "loss": 0.0155, + "reward": 1.043928012251854, + "reward_std": 0.5074506774544716, + "rewards/accuracy_reward": 0.734693855047226, + "rewards/improved_len_reward_dast": 0.3092341625597328, + "step": 201 + }, + { + "completion_length": 1443.3367156982422, + "epoch": 0.5236552171095269, + "grad_norm": 0.190656293014884, + "kl": 0.01007080078125, + "learning_rate": 5.908027458136518e-07, + "loss": 0.027, + "reward": 1.5769412517547607, + "reward_std": 0.27542993798851967, + "rewards/accuracy_reward": 0.9081632494926453, + "rewards/improved_len_reward_dast": 0.6687779873609543, + "step": 202 + }, + { + "completion_length": 1383.1325988769531, + "epoch": 0.5262475696694751, + "grad_norm": 0.18700146403961007, + "kl": 0.00789642333984375, + "learning_rate": 5.867320642364916e-07, + "loss": -0.0, + "reward": 1.4069096446037292, + "reward_std": 0.452865906059742, + "rewards/accuracy_reward": 0.8571428507566452, + "rewards/improved_len_reward_dast": 0.5497667863965034, + "step": 203 + }, + { + "completion_length": 1636.7448425292969, + "epoch": 0.5288399222294232, + "grad_norm": 0.18621798443065538, + "kl": 0.01001739501953125, + "learning_rate": 5.826583544220678e-07, + "loss": 0.0023, + "reward": 1.1149714589118958, + "reward_std": 0.5129830092191696, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.3751755505800247, + "step": 204 + }, + { + "completion_length": 1296.4540252685547, + "epoch": 0.5314322747893714, + "grad_norm": 0.24973009441281563, + "kl": 0.00960540771484375, + "learning_rate": 5.78581952212107e-07, + "loss": 0.057, + "reward": 1.439581423997879, + "reward_std": 0.20332731679081917, + "rewards/accuracy_reward": 0.8775510191917419, + "rewards/improved_len_reward_dast": 0.5620303899049759, + "step": 205 + }, + { + "completion_length": 1675.2040405273438, + "epoch": 0.5340246273493195, + "grad_norm": 0.17994542833868402, + "kl": 0.0113983154296875, + "learning_rate": 5.745031936702997e-07, + "loss": 0.0212, + "reward": 1.236918032169342, + "reward_std": 0.4141309931874275, + "rewards/accuracy_reward": 0.7755101919174194, + "rewards/improved_len_reward_dast": 0.46140778064727783, + "step": 206 + }, + { + "completion_length": 1685.6376953125, + "epoch": 0.5366169799092677, + "grad_norm": 0.19387833193950482, + "kl": 0.0142364501953125, + "learning_rate": 5.704224150545956e-07, + "loss": 0.0032, + "reward": 1.1570499688386917, + "reward_std": 0.4146932289004326, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.4172540530562401, + "step": 207 + }, + { + "completion_length": 1249.0101928710938, + "epoch": 0.5392093324692158, + "grad_norm": 0.1923070203823955, + "kl": 0.0085906982421875, + "learning_rate": 5.663399527894816e-07, + "loss": 0.0138, + "reward": 1.4272409826517105, + "reward_std": 0.34243838489055634, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.5649960786104202, + "step": 208 + }, + { + "completion_length": 1525.1734313964844, + "epoch": 0.5418016850291639, + "grad_norm": 0.19609225255735566, + "kl": 0.01036834716796875, + "learning_rate": 5.622561434382467e-07, + "loss": 0.0011, + "reward": 1.1873522847890854, + "reward_std": 0.4918947294354439, + "rewards/accuracy_reward": 0.8010203838348389, + "rewards/improved_len_reward_dast": 0.386331919580698, + "step": 209 + }, + { + "completion_length": 1988.4591064453125, + "epoch": 0.5443940375891121, + "grad_norm": 0.2322805815292897, + "kl": 0.0143280029296875, + "learning_rate": 5.581713236752361e-07, + "loss": 0.0289, + "reward": 1.1922202408313751, + "reward_std": 0.2860515546053648, + "rewards/accuracy_reward": 0.7244897782802582, + "rewards/improved_len_reward_dast": 0.46773041412234306, + "step": 210 + }, + { + "completion_length": 1433.290771484375, + "epoch": 0.5469863901490603, + "grad_norm": 0.2984688713886969, + "kl": 0.0114898681640625, + "learning_rate": 5.540858302580934e-07, + "loss": 0.0818, + "reward": 1.3492214977741241, + "reward_std": 0.3557019531726837, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.48697663098573685, + "step": 211 + }, + { + "completion_length": 1686.086669921875, + "epoch": 0.5495787427090084, + "grad_norm": 0.17323504261296585, + "kl": 0.01081085205078125, + "learning_rate": 5.5e-07, + "loss": -0.0227, + "reward": 0.910240039229393, + "reward_std": 0.49440842866897583, + "rewards/accuracy_reward": 0.6632653027772903, + "rewards/improved_len_reward_dast": 0.24697477743029594, + "step": 212 + }, + { + "completion_length": 1503.3571166992188, + "epoch": 0.5521710952689566, + "grad_norm": 0.19940687047680583, + "kl": 0.0108795166015625, + "learning_rate": 5.459141697419066e-07, + "loss": 0.0196, + "reward": 1.414816826581955, + "reward_std": 0.24907327815890312, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.5525719411671162, + "step": 213 + }, + { + "completion_length": 1326.4744720458984, + "epoch": 0.5547634478289047, + "grad_norm": 0.1968213437884411, + "kl": 0.00897216796875, + "learning_rate": 5.418286763247641e-07, + "loss": 0.0333, + "reward": 1.5710687637329102, + "reward_std": 0.27853039279580116, + "rewards/accuracy_reward": 0.9336734712123871, + "rewards/improved_len_reward_dast": 0.6373953074216843, + "step": 214 + }, + { + "completion_length": 1814.7856750488281, + "epoch": 0.5573558003888529, + "grad_norm": 0.1910754560182501, + "kl": 0.0157623291015625, + "learning_rate": 5.377438565617532e-07, + "loss": 0.0053, + "reward": 1.1130409240722656, + "reward_std": 0.5712603330612183, + "rewards/accuracy_reward": 0.7091836780309677, + "rewards/improved_len_reward_dast": 0.4038572832942009, + "step": 215 + }, + { + "completion_length": 2041.4693603515625, + "epoch": 0.5599481529488011, + "grad_norm": 0.19528431114703992, + "kl": 0.017974853515625, + "learning_rate": 5.336600472105186e-07, + "loss": 0.0026, + "reward": 1.1326239556074142, + "reward_std": 0.5115986987948418, + "rewards/accuracy_reward": 0.7193877249956131, + "rewards/improved_len_reward_dast": 0.41323617100715637, + "step": 216 + }, + { + "completion_length": 1490.438720703125, + "epoch": 0.5625405055087492, + "grad_norm": 0.1818395863982982, + "kl": 0.011444091796875, + "learning_rate": 5.295775849454045e-07, + "loss": -0.025, + "reward": 1.1338547468185425, + "reward_std": 0.26832524314522743, + "rewards/accuracy_reward": 0.75, + "rewards/improved_len_reward_dast": 0.3838547393679619, + "step": 217 + }, + { + "completion_length": 1993.8571166992188, + "epoch": 0.5651328580686974, + "grad_norm": 0.23754078779498058, + "kl": 0.0171356201171875, + "learning_rate": 5.254968063297003e-07, + "loss": -0.0245, + "reward": 1.088214099407196, + "reward_std": 0.33989886194467545, + "rewards/accuracy_reward": 0.6938775330781937, + "rewards/improved_len_reward_dast": 0.3943365402519703, + "step": 218 + }, + { + "completion_length": 1916.8775024414062, + "epoch": 0.5677252106286454, + "grad_norm": 0.23169329147427764, + "kl": 0.0146942138671875, + "learning_rate": 5.214180477878931e-07, + "loss": -0.0216, + "reward": 1.1535532772541046, + "reward_std": 0.5523173958063126, + "rewards/accuracy_reward": 0.739795908331871, + "rewards/improved_len_reward_dast": 0.4137573465704918, + "step": 219 + }, + { + "completion_length": 2072.586700439453, + "epoch": 0.5703175631885936, + "grad_norm": 0.179237513002948, + "kl": 0.0157623291015625, + "learning_rate": 5.173416455779323e-07, + "loss": 0.0061, + "reward": 1.129465639591217, + "reward_std": 0.47254087403416634, + "rewards/accuracy_reward": 0.7397958934307098, + "rewards/improved_len_reward_dast": 0.3896697536110878, + "step": 220 + }, + { + "completion_length": 1500.7499694824219, + "epoch": 0.5729099157485418, + "grad_norm": 0.18878843129064268, + "kl": 0.01107025146484375, + "learning_rate": 5.132679357635086e-07, + "loss": -0.0142, + "reward": 1.1763963997364044, + "reward_std": 0.48718392848968506, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.40598829090595245, + "step": 221 + }, + { + "completion_length": 1644.9030151367188, + "epoch": 0.5755022683084899, + "grad_norm": 0.17742073908553643, + "kl": 0.0126495361328125, + "learning_rate": 5.091972541863481e-07, + "loss": 0.0186, + "reward": 1.1986051201820374, + "reward_std": 0.4172977935522795, + "rewards/accuracy_reward": 0.734693855047226, + "rewards/improved_len_reward_dast": 0.463911272585392, + "step": 222 + }, + { + "completion_length": 1161.091812133789, + "epoch": 0.5780946208684381, + "grad_norm": 0.189357723748229, + "kl": 0.00917816162109375, + "learning_rate": 5.051299364385257e-07, + "loss": 0.0034, + "reward": 1.5119259655475616, + "reward_std": 0.34742674231529236, + "rewards/accuracy_reward": 0.9030611962080002, + "rewards/improved_len_reward_dast": 0.6088647544384003, + "step": 223 + }, + { + "completion_length": 2160.7142944335938, + "epoch": 0.5806869734283863, + "grad_norm": 0.1958816052872559, + "kl": 0.0196075439453125, + "learning_rate": 5.010663178347971e-07, + "loss": 0.0345, + "reward": 1.2357909381389618, + "reward_std": 0.4518684595823288, + "rewards/accuracy_reward": 0.7448979318141937, + "rewards/improved_len_reward_dast": 0.4908929914236069, + "step": 224 + }, + { + "completion_length": 1368.7703552246094, + "epoch": 0.5832793259883344, + "grad_norm": 0.2126816864157868, + "kl": 0.01153564453125, + "learning_rate": 4.970067333849568e-07, + "loss": 0.0421, + "reward": 1.3800954520702362, + "reward_std": 0.24764511361718178, + "rewards/accuracy_reward": 0.8163265287876129, + "rewards/improved_len_reward_dast": 0.5637688413262367, + "step": 225 + }, + { + "completion_length": 1523.7958984375, + "epoch": 0.5858716785482826, + "grad_norm": 0.2103498219912096, + "kl": 0.013336181640625, + "learning_rate": 4.929515177662182e-07, + "loss": 0.0336, + "reward": 1.3088043332099915, + "reward_std": 0.3938099816441536, + "rewards/accuracy_reward": 0.8214285671710968, + "rewards/improved_len_reward_dast": 0.48737573623657227, + "step": 226 + }, + { + "completion_length": 1753.9897766113281, + "epoch": 0.5884640311082308, + "grad_norm": 0.17623732882686455, + "kl": 0.0133514404296875, + "learning_rate": 4.889010052956233e-07, + "loss": 0.0184, + "reward": 1.1956195682287216, + "reward_std": 0.38174545764923096, + "rewards/accuracy_reward": 0.7551020234823227, + "rewards/improved_len_reward_dast": 0.44051752984523773, + "step": 227 + }, + { + "completion_length": 1186.4795837402344, + "epoch": 0.5910563836681789, + "grad_norm": 0.19103765244425439, + "kl": 0.00911712646484375, + "learning_rate": 4.848555299024798e-07, + "loss": -0.0025, + "reward": 1.3858640789985657, + "reward_std": 0.2998353075236082, + "rewards/accuracy_reward": 0.8724489808082581, + "rewards/improved_len_reward_dast": 0.5134151205420494, + "step": 228 + }, + { + "completion_length": 1717.0713806152344, + "epoch": 0.593648736228127, + "grad_norm": 0.1787260124676487, + "kl": 0.01560211181640625, + "learning_rate": 4.80815425100833e-07, + "loss": 0.0131, + "reward": 1.2940033674240112, + "reward_std": 0.3880784399807453, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.5031870305538177, + "step": 229 + }, + { + "completion_length": 1570.3979187011719, + "epoch": 0.5962410887880751, + "grad_norm": 0.1932563584259016, + "kl": 0.0125732421875, + "learning_rate": 4.7678102396196983e-07, + "loss": 0.0028, + "reward": 1.194681242108345, + "reward_std": 0.36879952996969223, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.4242731127887964, + "step": 230 + }, + { + "completion_length": 1627.1173400878906, + "epoch": 0.5988334413480233, + "grad_norm": 0.20069193255347081, + "kl": 0.01148223876953125, + "learning_rate": 4.727526590869605e-07, + "loss": -0.0024, + "reward": 1.2599404603242874, + "reward_std": 0.3717983737587929, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.45381802320480347, + "step": 231 + }, + { + "completion_length": 1422.693832397461, + "epoch": 0.6014257939079715, + "grad_norm": 0.22397903045763606, + "kl": 0.011993408203125, + "learning_rate": 4.6873066257923735e-07, + "loss": -0.0198, + "reward": 1.1824947893619537, + "reward_std": 0.3314864858984947, + "rewards/accuracy_reward": 0.7806122153997421, + "rewards/improved_len_reward_dast": 0.4018825590610504, + "step": 232 + }, + { + "completion_length": 2077.2550659179688, + "epoch": 0.6040181464679196, + "grad_norm": 0.2622807945246562, + "kl": 0.0151519775390625, + "learning_rate": 4.647153660172173e-07, + "loss": 0.0607, + "reward": 1.1635594964027405, + "reward_std": 0.392416313290596, + "rewards/accuracy_reward": 0.7499999701976776, + "rewards/improved_len_reward_dast": 0.4135594889521599, + "step": 233 + }, + { + "completion_length": 1738.4336547851562, + "epoch": 0.6066104990278678, + "grad_norm": 0.24814578097643056, + "kl": 0.01483917236328125, + "learning_rate": 4.607071004269647e-07, + "loss": 0.031, + "reward": 1.369605004787445, + "reward_std": 0.3843038082122803, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.5583804696798325, + "step": 234 + }, + { + "completion_length": 1602.0713806152344, + "epoch": 0.609202851587816, + "grad_norm": 0.2094489678458985, + "kl": 0.01458740234375, + "learning_rate": 4.567061962549025e-07, + "loss": -0.0277, + "reward": 1.1768890023231506, + "reward_std": 0.5075602382421494, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.4115828797221184, + "step": 235 + }, + { + "completion_length": 1883.586669921875, + "epoch": 0.6117952041477641, + "grad_norm": 0.18539849926073623, + "kl": 0.01873779296875, + "learning_rate": 4.527129833405687e-07, + "loss": 0.0234, + "reward": 1.2962508648633957, + "reward_std": 0.23112722299993038, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.5309447646141052, + "step": 236 + }, + { + "completion_length": 1541.188720703125, + "epoch": 0.6143875567077123, + "grad_norm": 0.2211580384146908, + "kl": 0.013671875, + "learning_rate": 4.4872779088942425e-07, + "loss": 0.027, + "reward": 1.3446270525455475, + "reward_std": 0.4020156227052212, + "rewards/accuracy_reward": 0.8265305906534195, + "rewards/improved_len_reward_dast": 0.5180964693427086, + "step": 237 + }, + { + "completion_length": 1877.1122131347656, + "epoch": 0.6169799092676604, + "grad_norm": 0.27937868976565, + "kl": 0.0175018310546875, + "learning_rate": 4.447509474457135e-07, + "loss": -0.0519, + "reward": 1.3078001737594604, + "reward_std": 0.3943771682679653, + "rewards/accuracy_reward": 0.811224490404129, + "rewards/improved_len_reward_dast": 0.49657563865184784, + "step": 238 + }, + { + "completion_length": 1735.6836547851562, + "epoch": 0.6195722618276086, + "grad_norm": 0.19004402096856263, + "kl": 0.013519287109375, + "learning_rate": 4.4078278086537823e-07, + "loss": 0.019, + "reward": 1.430199384689331, + "reward_std": 0.45470841974020004, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.5883626788854599, + "step": 239 + }, + { + "completion_length": 1290.8877258300781, + "epoch": 0.6221646143875567, + "grad_norm": 0.20039034607000805, + "kl": 0.00916290283203125, + "learning_rate": 4.3682361828902846e-07, + "loss": 0.0204, + "reward": 1.4429042339324951, + "reward_std": 0.40230638161301613, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.5857614576816559, + "step": 240 + }, + { + "completion_length": 1543.5713958740234, + "epoch": 0.6247569669475048, + "grad_norm": 0.1796128893155037, + "kl": 0.0121002197265625, + "learning_rate": 4.328737861149726e-07, + "loss": 0.0061, + "reward": 1.060480311512947, + "reward_std": 0.4090285710990429, + "rewards/accuracy_reward": 0.7040816247463226, + "rewards/improved_len_reward_dast": 0.35639870166778564, + "step": 241 + }, + { + "completion_length": 1650.6581420898438, + "epoch": 0.627349319507453, + "grad_norm": 0.17035045538288204, + "kl": 0.0127410888671875, + "learning_rate": 4.289336099723098e-07, + "loss": -0.0068, + "reward": 1.2868027091026306, + "reward_std": 0.4846101552248001, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.49088432639837265, + "step": 242 + }, + { + "completion_length": 1806.8724212646484, + "epoch": 0.6299416720674011, + "grad_norm": 0.21153725027052578, + "kl": 0.01531982421875, + "learning_rate": 4.250034146940834e-07, + "loss": 0.0342, + "reward": 1.3773571997880936, + "reward_std": 0.32580330967903137, + "rewards/accuracy_reward": 0.8265305906534195, + "rewards/improved_len_reward_dast": 0.5508265644311905, + "step": 243 + }, + { + "completion_length": 1506.8877410888672, + "epoch": 0.6325340246273493, + "grad_norm": 0.20274200364313702, + "kl": 0.01300048828125, + "learning_rate": 4.210835242905023e-07, + "loss": 0.0114, + "reward": 1.3944001197814941, + "reward_std": 0.35993905924260616, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.5270532071590424, + "step": 244 + }, + { + "completion_length": 1694.5713806152344, + "epoch": 0.6351263771872975, + "grad_norm": 0.20631633070295144, + "kl": 0.01531982421875, + "learning_rate": 4.1717426192222784e-07, + "loss": 0.001, + "reward": 1.269565299153328, + "reward_std": 0.3799453191459179, + "rewards/accuracy_reward": 0.7908162921667099, + "rewards/improved_len_reward_dast": 0.4787489101290703, + "step": 245 + }, + { + "completion_length": 2018.9642028808594, + "epoch": 0.6377187297472456, + "grad_norm": 0.23377044647625822, + "kl": 0.01549530029296875, + "learning_rate": 4.1327594987373347e-07, + "loss": 0.0057, + "reward": 0.9710913375020027, + "reward_std": 0.4150635525584221, + "rewards/accuracy_reward": 0.6479591578245163, + "rewards/improved_len_reward_dast": 0.3231321321800351, + "step": 246 + }, + { + "completion_length": 1953.44384765625, + "epoch": 0.6403110823071938, + "grad_norm": 0.18922091960973522, + "kl": 0.0152740478515625, + "learning_rate": 4.0938890952673443e-07, + "loss": -0.0073, + "reward": 1.144493117928505, + "reward_std": 0.326381828635931, + "rewards/accuracy_reward": 0.6989795714616776, + "rewards/improved_len_reward_dast": 0.445513516664505, + "step": 247 + }, + { + "completion_length": 1779.9234771728516, + "epoch": 0.642903434867142, + "grad_norm": 0.19009690153217312, + "kl": 0.01587677001953125, + "learning_rate": 4.05513461333693e-07, + "loss": 0.0056, + "reward": 1.2144882082939148, + "reward_std": 0.3660648465156555, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.45428410917520523, + "step": 248 + }, + { + "completion_length": 1680.5816040039062, + "epoch": 0.6454957874270901, + "grad_norm": 0.18737871436935236, + "kl": 0.01519775390625, + "learning_rate": 4.016499247913994e-07, + "loss": 0.0155, + "reward": 1.228882908821106, + "reward_std": 0.42849814891815186, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.4584747403860092, + "step": 249 + }, + { + "completion_length": 1700.0765075683594, + "epoch": 0.6480881399870383, + "grad_norm": 0.19083582747427946, + "kl": 0.01373291015625, + "learning_rate": 3.977986184146328e-07, + "loss": 0.0276, + "reward": 1.4491282403469086, + "reward_std": 0.29963432252407074, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.6021894812583923, + "step": 250 + }, + { + "completion_length": 1699.5050964355469, + "epoch": 0.6506804925469863, + "grad_norm": 0.18294974628895902, + "kl": 0.01318359375, + "learning_rate": 3.939598597099022e-07, + "loss": -0.0028, + "reward": 1.1291119307279587, + "reward_std": 0.4640827924013138, + "rewards/accuracy_reward": 0.7499999850988388, + "rewards/improved_len_reward_dast": 0.3791119046509266, + "step": 251 + }, + { + "completion_length": 1555.9489135742188, + "epoch": 0.6532728451069345, + "grad_norm": 0.2987585035266382, + "kl": 0.013702392578125, + "learning_rate": 3.9013396514927076e-07, + "loss": -0.0182, + "reward": 1.2567480206489563, + "reward_std": 0.38375869020819664, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.4710337221622467, + "step": 252 + }, + { + "completion_length": 2022.5509643554688, + "epoch": 0.6558651976668827, + "grad_norm": 0.16778625708063813, + "kl": 0.0160064697265625, + "learning_rate": 3.8632125014426566e-07, + "loss": 0.0026, + "reward": 1.0748438835144043, + "reward_std": 0.3207223527133465, + "rewards/accuracy_reward": 0.6836734712123871, + "rewards/improved_len_reward_dast": 0.3911704570055008, + "step": 253 + }, + { + "completion_length": 2008.7550659179688, + "epoch": 0.6584575502268308, + "grad_norm": 0.20081517128616475, + "kl": 0.017364501953125, + "learning_rate": 3.8252202901987474e-07, + "loss": -0.0036, + "reward": 1.1095408350229263, + "reward_std": 0.42732013761997223, + "rewards/accuracy_reward": 0.7193877398967743, + "rewards/improved_len_reward_dast": 0.39015308022499084, + "step": 254 + }, + { + "completion_length": 1753.5305786132812, + "epoch": 0.661049902786779, + "grad_norm": 0.19286213527020518, + "kl": 0.015838623046875, + "learning_rate": 3.7873661498863384e-07, + "loss": -0.0193, + "reward": 1.3401989042758942, + "reward_std": 0.44482723623514175, + "rewards/accuracy_reward": 0.8367346823215485, + "rewards/improved_len_reward_dast": 0.5034642219543457, + "step": 255 + }, + { + "completion_length": 1714.8316040039062, + "epoch": 0.6636422553467272, + "grad_norm": 0.19098352531749854, + "kl": 0.015716552734375, + "learning_rate": 3.7496532012480463e-07, + "loss": -0.0172, + "reward": 1.285597413778305, + "reward_std": 0.3779995068907738, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.4947810471057892, + "step": 256 + }, + { + "completion_length": 1587.0254821777344, + "epoch": 0.6662346079066753, + "grad_norm": 0.1828164836366847, + "kl": 0.01513671875, + "learning_rate": 3.7120845533864706e-07, + "loss": 0.0165, + "reward": 1.2909784018993378, + "reward_std": 0.3537175990641117, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.5001621246337891, + "step": 257 + }, + { + "completion_length": 1945.8519897460938, + "epoch": 0.6688269604666235, + "grad_norm": 0.2401064586242113, + "kl": 0.018310546875, + "learning_rate": 3.6746633035078723e-07, + "loss": -0.0254, + "reward": 0.9318393021821976, + "reward_std": 0.3634992204606533, + "rewards/accuracy_reward": 0.6530612260103226, + "rewards/improved_len_reward_dast": 0.2787781246006489, + "step": 258 + }, + { + "completion_length": 1464.5356903076172, + "epoch": 0.6714193130265717, + "grad_norm": 0.19897550034047456, + "kl": 0.0117645263671875, + "learning_rate": 3.63739253666684e-07, + "loss": 0.0257, + "reward": 1.3326016068458557, + "reward_std": 0.25891564041376114, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.48566286638379097, + "step": 259 + }, + { + "completion_length": 2040.6173095703125, + "epoch": 0.6740116655865198, + "grad_norm": 0.2093225075876704, + "kl": 0.01587677001953125, + "learning_rate": 3.6002753255119533e-07, + "loss": 0.0446, + "reward": 1.1549495160579681, + "reward_std": 0.6060752719640732, + "rewards/accuracy_reward": 0.7295918166637421, + "rewards/improved_len_reward_dast": 0.42535772174596786, + "step": 260 + }, + { + "completion_length": 1504.892837524414, + "epoch": 0.6766040181464679, + "grad_norm": 0.2413238757963301, + "kl": 0.013092041015625, + "learning_rate": 3.5633147300324706e-07, + "loss": 0.039, + "reward": 1.3253722488880157, + "reward_std": 0.22303567081689835, + "rewards/accuracy_reward": 0.7755101919174194, + "rewards/improved_len_reward_dast": 0.5498620271682739, + "step": 261 + }, + { + "completion_length": 1835.6020202636719, + "epoch": 0.679196370706416, + "grad_norm": 0.1742605810963208, + "kl": 0.0152587890625, + "learning_rate": 3.526513797306051e-07, + "loss": 0.023, + "reward": 1.3810910284519196, + "reward_std": 0.3878571353852749, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.5341522693634033, + "step": 262 + }, + { + "completion_length": 1934.44384765625, + "epoch": 0.6817887232663642, + "grad_norm": 0.18402016017590034, + "kl": 0.0189971923828125, + "learning_rate": 3.489875561247568e-07, + "loss": 0.0326, + "reward": 1.1064758449792862, + "reward_std": 0.5427646264433861, + "rewards/accuracy_reward": 0.75, + "rewards/improved_len_reward_dast": 0.3564758636057377, + "step": 263 + }, + { + "completion_length": 1527.6479187011719, + "epoch": 0.6843810758263124, + "grad_norm": 0.2535051321853217, + "kl": 0.0133209228515625, + "learning_rate": 3.453403042358968e-07, + "loss": 0.0594, + "reward": 1.3837721645832062, + "reward_std": 0.3384307250380516, + "rewards/accuracy_reward": 0.8571428507566452, + "rewards/improved_len_reward_dast": 0.5266292989253998, + "step": 264 + }, + { + "completion_length": 1750.1275329589844, + "epoch": 0.6869734283862605, + "grad_norm": 0.20005193883523226, + "kl": 0.014312744140625, + "learning_rate": 3.417099247480277e-07, + "loss": 0.0069, + "reward": 1.1163494735956192, + "reward_std": 0.4810503050684929, + "rewards/accuracy_reward": 0.7295918166637421, + "rewards/improved_len_reward_dast": 0.3867576252669096, + "step": 265 + }, + { + "completion_length": 1910.5254821777344, + "epoch": 0.6895657809462087, + "grad_norm": 0.3018048627256463, + "kl": 0.0156402587890625, + "learning_rate": 3.3809671695416916e-07, + "loss": 0.0357, + "reward": 1.147754654288292, + "reward_std": 0.5025169178843498, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.3824485056102276, + "step": 266 + }, + { + "completion_length": 1284.0663146972656, + "epoch": 0.6921581335061568, + "grad_norm": 0.18258330323366856, + "kl": 0.0092926025390625, + "learning_rate": 3.345009787316859e-07, + "loss": 0.0015, + "reward": 1.4202894866466522, + "reward_std": 0.2870555892586708, + "rewards/accuracy_reward": 0.8418367058038712, + "rewards/improved_len_reward_dast": 0.5784527361392975, + "step": 267 + }, + { + "completion_length": 1557.5612030029297, + "epoch": 0.694750486066105, + "grad_norm": 0.1849700340313966, + "kl": 0.012725830078125, + "learning_rate": 3.309230065177289e-07, + "loss": -0.0079, + "reward": 1.4877441823482513, + "reward_std": 0.302555400878191, + "rewards/accuracy_reward": 0.8622448742389679, + "rewards/improved_len_reward_dast": 0.6254993677139282, + "step": 268 + }, + { + "completion_length": 1482.5203552246094, + "epoch": 0.6973428386260532, + "grad_norm": 0.19171071001803489, + "kl": 0.0144500732421875, + "learning_rate": 3.273630952847971e-07, + "loss": -0.0012, + "reward": 1.2047373950481415, + "reward_std": 0.48537394404411316, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.4445333182811737, + "step": 269 + }, + { + "completion_length": 1744.6070861816406, + "epoch": 0.6999351911860013, + "grad_norm": 0.17132128213246742, + "kl": 0.01513671875, + "learning_rate": 3.2382153851641996e-07, + "loss": 0.0229, + "reward": 1.1097373962402344, + "reward_std": 0.2911606300622225, + "rewards/accuracy_reward": 0.7295918166637421, + "rewards/improved_len_reward_dast": 0.38014551997184753, + "step": 270 + }, + { + "completion_length": 1705.5968933105469, + "epoch": 0.7025275437459495, + "grad_norm": 0.2582533948663525, + "kl": 0.01708984375, + "learning_rate": 3.202986281829616e-07, + "loss": 0.045, + "reward": 1.3047520220279694, + "reward_std": 0.4435114786028862, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.4986295886337757, + "step": 271 + }, + { + "completion_length": 1806.591796875, + "epoch": 0.7051198963058976, + "grad_norm": 0.17993615347196873, + "kl": 0.01581573486328125, + "learning_rate": 3.1679465471755106e-07, + "loss": 0.016, + "reward": 1.2005809843540192, + "reward_std": 0.2893667705357075, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.45568302273750305, + "step": 272 + }, + { + "completion_length": 1960.2244262695312, + "epoch": 0.7077122488658457, + "grad_norm": 0.21394731890393012, + "kl": 0.018402099609375, + "learning_rate": 3.1330990699213824e-07, + "loss": 0.0026, + "reward": 1.3150149285793304, + "reward_std": 0.32834067940711975, + "rewards/accuracy_reward": 0.7602040469646454, + "rewards/improved_len_reward_dast": 0.5548108592629433, + "step": 273 + }, + { + "completion_length": 1648.7601623535156, + "epoch": 0.7103046014257939, + "grad_norm": 0.22677843577967902, + "kl": 0.0144500732421875, + "learning_rate": 3.0984467229367885e-07, + "loss": -0.0289, + "reward": 1.186056673526764, + "reward_std": 0.3048909828066826, + "rewards/accuracy_reward": 0.7653061002492905, + "rewards/improved_len_reward_dast": 0.42075058072805405, + "step": 274 + }, + { + "completion_length": 1631.3876953125, + "epoch": 0.712896953985742, + "grad_norm": 0.18075852179231652, + "kl": 0.0135955810546875, + "learning_rate": 3.063992363004503e-07, + "loss": -0.0047, + "reward": 1.3900758624076843, + "reward_std": 0.35281531512737274, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.5737493187189102, + "step": 275 + }, + { + "completion_length": 1794.5203857421875, + "epoch": 0.7154893065456902, + "grad_norm": 0.20597152512904204, + "kl": 0.0141143798828125, + "learning_rate": 3.0297388305850004e-07, + "loss": 0.0135, + "reward": 1.2308696657419205, + "reward_std": 0.3947853706777096, + "rewards/accuracy_reward": 0.7959183603525162, + "rewards/improved_len_reward_dast": 0.434951264411211, + "step": 276 + }, + { + "completion_length": 1608.892822265625, + "epoch": 0.7180816591056384, + "grad_norm": 0.22201185510570046, + "kl": 0.0151519775390625, + "learning_rate": 2.9956889495822877e-07, + "loss": 0.0463, + "reward": 1.3714110851287842, + "reward_std": 0.41973991319537163, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.549982562661171, + "step": 277 + }, + { + "completion_length": 1833.0203552246094, + "epoch": 0.7206740116655865, + "grad_norm": 0.18677648497687657, + "kl": 0.0153656005859375, + "learning_rate": 2.961845527111091e-07, + "loss": 0.0087, + "reward": 1.1960042417049408, + "reward_std": 0.35424697771668434, + "rewards/accuracy_reward": 0.7499999850988388, + "rewards/improved_len_reward_dast": 0.4460042342543602, + "step": 278 + }, + { + "completion_length": 1663.1989440917969, + "epoch": 0.7232663642255347, + "grad_norm": 0.23408313686800128, + "kl": 0.0152435302734375, + "learning_rate": 2.9282113532654363e-07, + "loss": 0.0496, + "reward": 1.2954119145870209, + "reward_std": 0.4828920140862465, + "rewards/accuracy_reward": 0.8265306055545807, + "rewards/improved_len_reward_dast": 0.46888134628534317, + "step": 279 + }, + { + "completion_length": 1693.0254974365234, + "epoch": 0.7258587167854829, + "grad_norm": 0.23913668563173046, + "kl": 0.019439697265625, + "learning_rate": 2.894789200888634e-07, + "loss": 0.0174, + "reward": 1.4143796861171722, + "reward_std": 0.37724653631448746, + "rewards/accuracy_reward": 0.8367346674203873, + "rewards/improved_len_reward_dast": 0.5776450335979462, + "step": 280 + }, + { + "completion_length": 1277.8468780517578, + "epoch": 0.728451069345431, + "grad_norm": 0.2694215840510146, + "kl": 0.0134429931640625, + "learning_rate": 2.8615818253446766e-07, + "loss": 0.0046, + "reward": 1.4540930390357971, + "reward_std": 0.3243625983595848, + "rewards/accuracy_reward": 0.8775509893894196, + "rewards/improved_len_reward_dast": 0.5765420496463776, + "step": 281 + }, + { + "completion_length": 1236.0356903076172, + "epoch": 0.7310434219053791, + "grad_norm": 0.1871177689494516, + "kl": 0.0116729736328125, + "learning_rate": 2.828591964291093e-07, + "loss": 0.0055, + "reward": 1.2881307899951935, + "reward_std": 0.42027105391025543, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.466702226549387, + "step": 282 + }, + { + "completion_length": 1389.3673095703125, + "epoch": 0.7336357744653272, + "grad_norm": 0.17949852486745174, + "kl": 0.0106201171875, + "learning_rate": 2.7958223374532363e-07, + "loss": -0.029, + "reward": 1.2979092001914978, + "reward_std": 0.34224472381174564, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.4407663494348526, + "step": 283 + }, + { + "completion_length": 1291.64794921875, + "epoch": 0.7362281270252754, + "grad_norm": 0.20498717449578613, + "kl": 0.01025390625, + "learning_rate": 2.7632756464000835e-07, + "loss": 0.0333, + "reward": 1.6148460805416107, + "reward_std": 0.25412340462207794, + "rewards/accuracy_reward": 0.9234693795442581, + "rewards/improved_len_reward_dast": 0.6913766860961914, + "step": 284 + }, + { + "completion_length": 1941.4284973144531, + "epoch": 0.7388204795852236, + "grad_norm": 0.19896247201933293, + "kl": 0.019378662109375, + "learning_rate": 2.730954574321503e-07, + "loss": 0.0303, + "reward": 1.0792112797498703, + "reward_std": 0.38586486876010895, + "rewards/accuracy_reward": 0.7142857015132904, + "rewards/improved_len_reward_dast": 0.3649255894124508, + "step": 285 + }, + { + "completion_length": 1503.8826446533203, + "epoch": 0.7414128321451717, + "grad_norm": 0.22350544706234096, + "kl": 0.01275634765625, + "learning_rate": 2.698861785807055e-07, + "loss": 0.0311, + "reward": 1.5651328265666962, + "reward_std": 0.3553974963724613, + "rewards/accuracy_reward": 0.9030612260103226, + "rewards/improved_len_reward_dast": 0.6620715856552124, + "step": 286 + }, + { + "completion_length": 1731.8214111328125, + "epoch": 0.7440051847051199, + "grad_norm": 0.23609281842069962, + "kl": 0.0157470703125, + "learning_rate": 2.6669999266263154e-07, + "loss": -0.0306, + "reward": 1.1723814904689789, + "reward_std": 0.5022178217768669, + "rewards/accuracy_reward": 0.7602040767669678, + "rewards/improved_len_reward_dast": 0.41217736527323723, + "step": 287 + }, + { + "completion_length": 1870.0458679199219, + "epoch": 0.7465975372650681, + "grad_norm": 0.15632978700328695, + "kl": 0.0158843994140625, + "learning_rate": 2.635371623510758e-07, + "loss": 0.0204, + "reward": 1.0800221413373947, + "reward_std": 0.2878151945769787, + "rewards/accuracy_reward": 0.6887754872441292, + "rewards/improved_len_reward_dast": 0.39124663546681404, + "step": 288 + }, + { + "completion_length": 1414.2703552246094, + "epoch": 0.7491898898250162, + "grad_norm": 0.23286966119816113, + "kl": 0.0133056640625, + "learning_rate": 2.6039794839372066e-07, + "loss": -0.0074, + "reward": 1.341863602399826, + "reward_std": 0.36198627576231956, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.530639074742794, + "step": 289 + }, + { + "completion_length": 1749.2295532226562, + "epoch": 0.7517822423849644, + "grad_norm": 0.17241966258758817, + "kl": 0.0135955810546875, + "learning_rate": 2.5728260959128614e-07, + "loss": -0.0129, + "reward": 1.2213443964719772, + "reward_std": 0.4387034922838211, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.46114034205675125, + "step": 290 + }, + { + "completion_length": 2126.826446533203, + "epoch": 0.7543745949449125, + "grad_norm": 0.2030042278234921, + "kl": 0.018890380859375, + "learning_rate": 2.541914027761951e-07, + "loss": 0.0435, + "reward": 1.1566181033849716, + "reward_std": 0.505137488245964, + "rewards/accuracy_reward": 0.7244897782802582, + "rewards/improved_len_reward_dast": 0.43212827295064926, + "step": 291 + }, + { + "completion_length": 1632.0713653564453, + "epoch": 0.7569669475048607, + "grad_norm": 0.24718377241844533, + "kl": 0.016876220703125, + "learning_rate": 2.511245827913991e-07, + "loss": 0.0421, + "reward": 1.2267541885375977, + "reward_std": 0.3394501358270645, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.4563460126519203, + "step": 292 + }, + { + "completion_length": 1807.6529846191406, + "epoch": 0.7595593000648088, + "grad_norm": 0.1861047697263272, + "kl": 0.01556396484375, + "learning_rate": 2.4808240246936866e-07, + "loss": -0.0078, + "reward": 1.2387667298316956, + "reward_std": 0.4819525480270386, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.44284842535853386, + "step": 293 + }, + { + "completion_length": 1847.19384765625, + "epoch": 0.7621516526247569, + "grad_norm": 0.22670935044930915, + "kl": 0.018310546875, + "learning_rate": 2.450651126112504e-07, + "loss": 0.0266, + "reward": 1.4322427809238434, + "reward_std": 0.2754583992063999, + "rewards/accuracy_reward": 0.8418367058038712, + "rewards/improved_len_reward_dast": 0.590406060218811, + "step": 294 + }, + { + "completion_length": 1595.9795532226562, + "epoch": 0.7647440051847051, + "grad_norm": 0.20527730505286215, + "kl": 0.015838623046875, + "learning_rate": 2.4207296196618924e-07, + "loss": 0.0242, + "reward": 1.3626587092876434, + "reward_std": 0.32539451494812965, + "rewards/accuracy_reward": 0.7908162921667099, + "rewards/improved_len_reward_dast": 0.5718424171209335, + "step": 295 + }, + { + "completion_length": 1054.137710571289, + "epoch": 0.7673363577446533, + "grad_norm": 0.21493362850187817, + "kl": 0.0093536376953125, + "learning_rate": 2.3910619721082253e-07, + "loss": 0.0196, + "reward": 1.4152240753173828, + "reward_std": 0.35989922285079956, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.5478771775960922, + "step": 296 + }, + { + "completion_length": 1474.3367004394531, + "epoch": 0.7699287103046014, + "grad_norm": 0.20358206304391516, + "kl": 0.0144500732421875, + "learning_rate": 2.3616506292894282e-07, + "loss": 0.0271, + "reward": 1.4626062214374542, + "reward_std": 0.29278943687677383, + "rewards/accuracy_reward": 0.8775510042905807, + "rewards/improved_len_reward_dast": 0.5850552245974541, + "step": 297 + }, + { + "completion_length": 1752.2295227050781, + "epoch": 0.7725210628645496, + "grad_norm": 0.1833066106969091, + "kl": 0.015289306640625, + "learning_rate": 2.332498015913344e-07, + "loss": 0.0009, + "reward": 1.3457911014556885, + "reward_std": 0.2773626856505871, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.5345666632056236, + "step": 298 + }, + { + "completion_length": 1325.688720703125, + "epoch": 0.7751134154244977, + "grad_norm": 0.19517765602950424, + "kl": 0.01210784912109375, + "learning_rate": 2.303606535357843e-07, + "loss": 0.0599, + "reward": 1.5037426948547363, + "reward_std": 0.26091703958809376, + "rewards/accuracy_reward": 0.8775510191917419, + "rewards/improved_len_reward_dast": 0.6261917278170586, + "step": 299 + }, + { + "completion_length": 1663.0662689208984, + "epoch": 0.7777057679844459, + "grad_norm": 0.20601240191104908, + "kl": 0.01605224609375, + "learning_rate": 2.2749785694726685e-07, + "loss": 0.0094, + "reward": 1.3560754358768463, + "reward_std": 0.37762896716594696, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.5346468687057495, + "step": 300 + }, + { + "completion_length": 1426.6173095703125, + "epoch": 0.7802981205443941, + "grad_norm": 0.20108821286385423, + "kl": 0.0143585205078125, + "learning_rate": 2.2466164783830972e-07, + "loss": 0.0207, + "reward": 1.3399082869291306, + "reward_std": 0.3976980447769165, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.5337858349084854, + "step": 301 + }, + { + "completion_length": 1790.8978881835938, + "epoch": 0.7828904731043422, + "grad_norm": 0.21383459811515595, + "kl": 0.0155029296875, + "learning_rate": 2.2185226002953483e-07, + "loss": 0.0004, + "reward": 1.2710506618022919, + "reward_std": 0.3618534617125988, + "rewards/accuracy_reward": 0.785714253783226, + "rewards/improved_len_reward_dast": 0.4853363707661629, + "step": 302 + }, + { + "completion_length": 1939.8775024414062, + "epoch": 0.7854828256642904, + "grad_norm": 0.29379980912133363, + "kl": 0.01885986328125, + "learning_rate": 2.1906992513038268e-07, + "loss": 0.0479, + "reward": 1.2805213034152985, + "reward_std": 0.4143086224794388, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.4692968502640724, + "step": 303 + }, + { + "completion_length": 1614.3775329589844, + "epoch": 0.7880751782242384, + "grad_norm": 0.17729210448855, + "kl": 0.0162353515625, + "learning_rate": 2.1631487252001822e-07, + "loss": 0.0049, + "reward": 1.234568029642105, + "reward_std": 0.417904369533062, + "rewards/accuracy_reward": 0.8010203838348389, + "rewards/improved_len_reward_dast": 0.43354763835668564, + "step": 304 + }, + { + "completion_length": 2287.780548095703, + "epoch": 0.7906675307841866, + "grad_norm": 1.2242934021255432, + "kl": 0.021087646484375, + "learning_rate": 2.1358732932842032e-07, + "loss": 0.0211, + "reward": 1.0315402448177338, + "reward_std": 0.36217188835144043, + "rewards/accuracy_reward": 0.6581632494926453, + "rewards/improved_len_reward_dast": 0.3733769580721855, + "step": 305 + }, + { + "completion_length": 1723.3673400878906, + "epoch": 0.7932598833441348, + "grad_norm": 0.20686736211065535, + "kl": 0.015533447265625, + "learning_rate": 2.1088752041765734e-07, + "loss": 0.0319, + "reward": 1.3500191867351532, + "reward_std": 0.3599831163883209, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.5438967421650887, + "step": 306 + }, + { + "completion_length": 1528.9183654785156, + "epoch": 0.7958522359040829, + "grad_norm": 0.21573348295043995, + "kl": 0.015960693359375, + "learning_rate": 2.0821566836334847e-07, + "loss": -0.0098, + "reward": 1.3639625310897827, + "reward_std": 0.3467046692967415, + "rewards/accuracy_reward": 0.8469387590885162, + "rewards/improved_len_reward_dast": 0.5170237571001053, + "step": 307 + }, + { + "completion_length": 1429.280532836914, + "epoch": 0.7984445884640311, + "grad_norm": 0.18304725042811948, + "kl": 0.01262664794921875, + "learning_rate": 2.0557199343631494e-07, + "loss": 0.0087, + "reward": 1.2729185968637466, + "reward_std": 0.37279824167490005, + "rewards/accuracy_reward": 0.8061224520206451, + "rewards/improved_len_reward_dast": 0.4667961820960045, + "step": 308 + }, + { + "completion_length": 1876.0458679199219, + "epoch": 0.8010369410239793, + "grad_norm": 0.20278131778947003, + "kl": 0.01853179931640625, + "learning_rate": 2.0295671358442033e-07, + "loss": 0.019, + "reward": 1.3648760467767715, + "reward_std": 0.3640540838241577, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.5536516159772873, + "step": 309 + }, + { + "completion_length": 1463.239730834961, + "epoch": 0.8036292935839274, + "grad_norm": 0.22793846718497435, + "kl": 0.014312744140625, + "learning_rate": 2.0037004441460263e-07, + "loss": 0.0287, + "reward": 1.3905141055583954, + "reward_std": 0.41797252371907234, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.5486774370074272, + "step": 310 + }, + { + "completion_length": 1581.4999542236328, + "epoch": 0.8062216461438756, + "grad_norm": 0.2080094216762287, + "kl": 0.01576995849609375, + "learning_rate": 1.9781219917509987e-07, + "loss": 0.0138, + "reward": 1.4025911092758179, + "reward_std": 0.3261520601809025, + "rewards/accuracy_reward": 0.8265306055545807, + "rewards/improved_len_reward_dast": 0.5760605186223984, + "step": 311 + }, + { + "completion_length": 1737.1019897460938, + "epoch": 0.8088139987038238, + "grad_norm": 0.22193491426249878, + "kl": 0.0164794921875, + "learning_rate": 1.9528338873786882e-07, + "loss": 0.0217, + "reward": 1.1316132843494415, + "reward_std": 0.44266829639673233, + "rewards/accuracy_reward": 0.7397959157824516, + "rewards/improved_len_reward_dast": 0.39181735552847385, + "step": 312 + }, + { + "completion_length": 1681.6224060058594, + "epoch": 0.8114063512637719, + "grad_norm": 0.21692033379747663, + "kl": 0.0162506103515625, + "learning_rate": 1.9278382158120116e-07, + "loss": 0.0256, + "reward": 1.2757752537727356, + "reward_std": 0.447167094796896, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.4798569083213806, + "step": 313 + }, + { + "completion_length": 1513.8316040039062, + "epoch": 0.81399870382372, + "grad_norm": 0.18130741669805844, + "kl": 0.01153564453125, + "learning_rate": 1.9031370377253574e-07, + "loss": 0.0246, + "reward": 1.535945862531662, + "reward_std": 0.31188252195715904, + "rewards/accuracy_reward": 0.8826530426740646, + "rewards/improved_len_reward_dast": 0.653292790055275, + "step": 314 + }, + { + "completion_length": 1734.6632385253906, + "epoch": 0.8165910563836681, + "grad_norm": 0.18939277983218827, + "kl": 0.0179443359375, + "learning_rate": 1.8787323895147052e-07, + "loss": -0.001, + "reward": 1.1586688458919525, + "reward_std": 0.4217538684606552, + "rewards/accuracy_reward": 0.7551020234823227, + "rewards/improved_len_reward_dast": 0.4035668522119522, + "step": 315 + }, + { + "completion_length": 1650.4846496582031, + "epoch": 0.8191834089436163, + "grad_norm": 0.2171448495391751, + "kl": 0.0167999267578125, + "learning_rate": 1.8546262831297438e-07, + "loss": -0.0121, + "reward": 1.464043915271759, + "reward_std": 0.3952450007200241, + "rewards/accuracy_reward": 0.8724489510059357, + "rewards/improved_len_reward_dast": 0.5915949791669846, + "step": 316 + }, + { + "completion_length": 1495.3316040039062, + "epoch": 0.8217757615035645, + "grad_norm": 0.19836205451789388, + "kl": 0.0137481689453125, + "learning_rate": 1.8308207059079938e-07, + "loss": -0.0069, + "reward": 1.1547789573669434, + "reward_std": 0.41507500410079956, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.3843708522617817, + "step": 317 + }, + { + "completion_length": 1517.8367004394531, + "epoch": 0.8243681140635126, + "grad_norm": 0.20600261332668526, + "kl": 0.0160064697265625, + "learning_rate": 1.8073176204109837e-07, + "loss": 0.0437, + "reward": 1.438821941614151, + "reward_std": 0.306551206856966, + "rewards/accuracy_reward": 0.8775510042905807, + "rewards/improved_len_reward_dast": 0.5612709149718285, + "step": 318 + }, + { + "completion_length": 1504.4285278320312, + "epoch": 0.8269604666234608, + "grad_norm": 0.21261278084781152, + "kl": 0.014495849609375, + "learning_rate": 1.7841189642624428e-07, + "loss": 0.0231, + "reward": 1.229389488697052, + "reward_std": 0.4350128807127476, + "rewards/accuracy_reward": 0.7959183603525162, + "rewards/improved_len_reward_dast": 0.4334711404517293, + "step": 319 + }, + { + "completion_length": 1672.8316040039062, + "epoch": 0.829552819183409, + "grad_norm": 0.1943882700904058, + "kl": 0.0173492431640625, + "learning_rate": 1.7612266499885642e-07, + "loss": 0.0464, + "reward": 1.5176236629486084, + "reward_std": 0.3366955704987049, + "rewards/accuracy_reward": 0.8877550810575485, + "rewards/improved_len_reward_dast": 0.6298686116933823, + "step": 320 + }, + { + "completion_length": 1179.0713653564453, + "epoch": 0.8321451717433571, + "grad_norm": 0.22615060777330476, + "kl": 0.012054443359375, + "learning_rate": 1.7386425648603354e-07, + "loss": 0.0423, + "reward": 1.5581437051296234, + "reward_std": 0.234028534963727, + "rewards/accuracy_reward": 0.8979591578245163, + "rewards/improved_len_reward_dast": 0.6601845473051071, + "step": 321 + }, + { + "completion_length": 1385.7346649169922, + "epoch": 0.8347375243033053, + "grad_norm": 0.18647668905538498, + "kl": 0.0132293701171875, + "learning_rate": 1.716368570737946e-07, + "loss": -0.0176, + "reward": 1.5387031435966492, + "reward_std": 0.39274929463863373, + "rewards/accuracy_reward": 0.9081632643938065, + "rewards/improved_len_reward_dast": 0.6305398866534233, + "step": 322 + }, + { + "completion_length": 1955.0357055664062, + "epoch": 0.8373298768632534, + "grad_norm": 0.1871384863519405, + "kl": 0.01862335205078125, + "learning_rate": 1.6944065039173004e-07, + "loss": 0.0282, + "reward": 0.9992491155862808, + "reward_std": 0.4749828167259693, + "rewards/accuracy_reward": 0.6785714030265808, + "rewards/improved_len_reward_dast": 0.3206777200102806, + "step": 323 + }, + { + "completion_length": 1949.9693298339844, + "epoch": 0.8399222294232016, + "grad_norm": 0.20078422959231634, + "kl": 0.020111083984375, + "learning_rate": 1.672758174978622e-07, + "loss": 0.0315, + "reward": 1.227005422115326, + "reward_std": 0.36194342374801636, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.46680130809545517, + "step": 324 + }, + { + "completion_length": 1403.64794921875, + "epoch": 0.8425145819831497, + "grad_norm": 0.20565437549884577, + "kl": 0.0128936767578125, + "learning_rate": 1.6514253686371917e-07, + "loss": 0.0204, + "reward": 1.4708826392889023, + "reward_std": 0.2500988617539406, + "rewards/accuracy_reward": 0.8826530426740646, + "rewards/improved_len_reward_dast": 0.5882296115159988, + "step": 325 + }, + { + "completion_length": 1667.8264770507812, + "epoch": 0.8451069345430978, + "grad_norm": 0.21813136540877595, + "kl": 0.0157318115234375, + "learning_rate": 1.630409843596216e-07, + "loss": 0.0307, + "reward": 1.3411798775196075, + "reward_std": 0.32134104520082474, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.53505739569664, + "step": 326 + }, + { + "completion_length": 1616.2908020019531, + "epoch": 0.847699287103046, + "grad_norm": 0.1969183257495155, + "kl": 0.0156402587890625, + "learning_rate": 1.609713332401831e-07, + "loss": 0.0085, + "reward": 1.2519380450248718, + "reward_std": 0.458795890212059, + "rewards/accuracy_reward": 0.7806122452020645, + "rewards/improved_len_reward_dast": 0.4713258519768715, + "step": 327 + }, + { + "completion_length": 1625.6377258300781, + "epoch": 0.8502916396629941, + "grad_norm": 0.24417535965250406, + "kl": 0.0139617919921875, + "learning_rate": 1.5893375413002765e-07, + "loss": -0.0317, + "reward": 1.2513196468353271, + "reward_std": 0.47703811526298523, + "rewards/accuracy_reward": 0.7704081386327744, + "rewards/improved_len_reward_dast": 0.4809115380048752, + "step": 328 + }, + { + "completion_length": 2058.948944091797, + "epoch": 0.8528839922229423, + "grad_norm": 0.19451912015501954, + "kl": 0.0210418701171875, + "learning_rate": 1.569284150097226e-07, + "loss": 0.0377, + "reward": 1.2445521801710129, + "reward_std": 0.26459160074591637, + "rewards/accuracy_reward": 0.7295918315649033, + "rewards/improved_len_reward_dast": 0.5149602852761745, + "step": 329 + }, + { + "completion_length": 1789.7040405273438, + "epoch": 0.8554763447828905, + "grad_norm": 0.24266903278771249, + "kl": 0.019378662109375, + "learning_rate": 1.5495548120193003e-07, + "loss": 0.0434, + "reward": 1.322462946176529, + "reward_std": 0.38080430775880814, + "rewards/accuracy_reward": 0.8265305906534195, + "rewards/improved_len_reward_dast": 0.49593234062194824, + "step": 330 + }, + { + "completion_length": 1468.8213653564453, + "epoch": 0.8580686973428386, + "grad_norm": 0.1945755306885796, + "kl": 0.01294708251953125, + "learning_rate": 1.5301511535777784e-07, + "loss": 0.0302, + "reward": 1.5070666372776031, + "reward_std": 0.3562978059053421, + "rewards/accuracy_reward": 0.8724489510059357, + "rewards/improved_len_reward_dast": 0.6346177160739899, + "step": 331 + }, + { + "completion_length": 1581.3825988769531, + "epoch": 0.8606610499027868, + "grad_norm": 0.29272858693831433, + "kl": 0.01812744140625, + "learning_rate": 1.5110747744345006e-07, + "loss": 0.0122, + "reward": 1.3418152332305908, + "reward_std": 0.4640466570854187, + "rewards/accuracy_reward": 0.8724489659070969, + "rewards/improved_len_reward_dast": 0.46936625242233276, + "step": 332 + }, + { + "completion_length": 1786.1734313964844, + "epoch": 0.863253402462735, + "grad_norm": 0.19480551857525122, + "kl": 0.019775390625, + "learning_rate": 1.4923272472699986e-07, + "loss": -0.0042, + "reward": 1.1590133309364319, + "reward_std": 0.2618263028562069, + "rewards/accuracy_reward": 0.7193877398967743, + "rewards/improved_len_reward_dast": 0.4396255351603031, + "step": 333 + }, + { + "completion_length": 1171.147933959961, + "epoch": 0.8658457550226831, + "grad_norm": 0.23814232802014945, + "kl": 0.013671875, + "learning_rate": 1.4739101176538274e-07, + "loss": 0.0174, + "reward": 1.2705652117729187, + "reward_std": 0.3895917683839798, + "rewards/accuracy_reward": 0.8367346823215485, + "rewards/improved_len_reward_dast": 0.43383053690195084, + "step": 334 + }, + { + "completion_length": 1758.0816040039062, + "epoch": 0.8684381075826313, + "grad_norm": 0.22764969968005389, + "kl": 0.0219268798828125, + "learning_rate": 1.4558249039171639e-07, + "loss": 0.0414, + "reward": 1.358829528093338, + "reward_std": 0.38345643877983093, + "rewards/accuracy_reward": 0.8367346823215485, + "rewards/improved_len_reward_dast": 0.5220948457717896, + "step": 335 + }, + { + "completion_length": 1889.0509948730469, + "epoch": 0.8710304601425793, + "grad_norm": 0.22895792507657853, + "kl": 0.021484375, + "learning_rate": 1.4380730970276195e-07, + "loss": 0.0354, + "reward": 1.07760888338089, + "reward_std": 0.3665538318455219, + "rewards/accuracy_reward": 0.6887754797935486, + "rewards/improved_len_reward_dast": 0.3888333588838577, + "step": 336 + }, + { + "completion_length": 2373.249969482422, + "epoch": 0.8736228127025275, + "grad_norm": 0.2697468121522664, + "kl": 0.026397705078125, + "learning_rate": 1.420656160466333e-07, + "loss": -0.0102, + "reward": 1.0278730392456055, + "reward_std": 0.348503515124321, + "rewards/accuracy_reward": 0.6938775330781937, + "rewards/improved_len_reward_dast": 0.33399548195302486, + "step": 337 + }, + { + "completion_length": 1981.8978881835938, + "epoch": 0.8762151652624757, + "grad_norm": 0.20587316419649823, + "kl": 0.0223846435546875, + "learning_rate": 1.4035755301073102e-07, + "loss": 0.0273, + "reward": 1.2939772605895996, + "reward_std": 0.46924955397844315, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.5286711901426315, + "step": 338 + }, + { + "completion_length": 1536.4336395263672, + "epoch": 0.8788075178224238, + "grad_norm": 0.20611627730954438, + "kl": 0.0202789306640625, + "learning_rate": 1.386832614099056e-07, + "loss": 0.006, + "reward": 1.4531451165676117, + "reward_std": 0.3475269414484501, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.5960022807121277, + "step": 339 + }, + { + "completion_length": 1489.7652435302734, + "epoch": 0.881399870382372, + "grad_norm": 0.2223037836334228, + "kl": 0.0159454345703125, + "learning_rate": 1.3704287927484846e-07, + "loss": -0.0138, + "reward": 1.3403507471084595, + "reward_std": 0.46086446195840836, + "rewards/accuracy_reward": 0.8112244606018066, + "rewards/improved_len_reward_dast": 0.529126301407814, + "step": 340 + }, + { + "completion_length": 1788.7091674804688, + "epoch": 0.8839922229423202, + "grad_norm": 0.188880858513302, + "kl": 0.0198516845703125, + "learning_rate": 1.3543654184071186e-07, + "loss": 0.0144, + "reward": 1.320367306470871, + "reward_std": 0.2726456895470619, + "rewards/accuracy_reward": 0.7755101919174194, + "rewards/improved_len_reward_dast": 0.5448571220040321, + "step": 341 + }, + { + "completion_length": 1541.3316192626953, + "epoch": 0.8865845755022683, + "grad_norm": 0.20649364949795315, + "kl": 0.01570892333984375, + "learning_rate": 1.3386438153596067e-07, + "loss": 0.0104, + "reward": 1.327652782201767, + "reward_std": 0.3968999646604061, + "rewards/accuracy_reward": 0.846938744187355, + "rewards/improved_len_reward_dast": 0.4807140678167343, + "step": 342 + }, + { + "completion_length": 1504.8775329589844, + "epoch": 0.8891769280622165, + "grad_norm": 0.23748978746970162, + "kl": 0.0181427001953125, + "learning_rate": 1.323265279714543e-07, + "loss": -0.0172, + "reward": 1.3229451477527618, + "reward_std": 0.38034195080399513, + "rewards/accuracy_reward": 0.8265306055545807, + "rewards/improved_len_reward_dast": 0.49641457200050354, + "step": 343 + }, + { + "completion_length": 1616.14794921875, + "epoch": 0.8917692806221647, + "grad_norm": 0.228900632017236, + "kl": 0.020263671875, + "learning_rate": 1.3082310792976202e-07, + "loss": 0.0331, + "reward": 1.4383951127529144, + "reward_std": 0.32518207281827927, + "rewards/accuracy_reward": 0.8520407974720001, + "rewards/improved_len_reward_dast": 0.5863542854785919, + "step": 344 + }, + { + "completion_length": 1765.0509948730469, + "epoch": 0.8943616331821128, + "grad_norm": 0.21689615981919957, + "kl": 0.0205841064453125, + "learning_rate": 1.293542453547102e-07, + "loss": 0.0219, + "reward": 1.3277872800827026, + "reward_std": 0.4930282086133957, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.5114607587456703, + "step": 345 + }, + { + "completion_length": 1576.6071166992188, + "epoch": 0.8969539857420609, + "grad_norm": 0.2503011086919002, + "kl": 0.0197906494140625, + "learning_rate": 1.279200613411642e-07, + "loss": 0.044, + "reward": 1.2905025482177734, + "reward_std": 0.47432298958301544, + "rewards/accuracy_reward": 0.8214285522699356, + "rewards/improved_len_reward_dast": 0.46907395869493484, + "step": 346 + }, + { + "completion_length": 2153.3162231445312, + "epoch": 0.899546338302009, + "grad_norm": 0.23273243697852358, + "kl": 0.023712158203125, + "learning_rate": 1.2652067412504605e-07, + "loss": 0.0312, + "reward": 1.047543928027153, + "reward_std": 0.3953222408890724, + "rewards/accuracy_reward": 0.688775509595871, + "rewards/improved_len_reward_dast": 0.35876838117837906, + "step": 347 + }, + { + "completion_length": 1542.3111877441406, + "epoch": 0.9021386908619572, + "grad_norm": 0.25879665856811085, + "kl": 0.0159149169921875, + "learning_rate": 1.251561990735859e-07, + "loss": 0.0306, + "reward": 1.4665509164333344, + "reward_std": 0.34583452716469765, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.599203959107399, + "step": 348 + }, + { + "completion_length": 2166.5713806152344, + "epoch": 0.9047310434219054, + "grad_norm": 0.21742881103681694, + "kl": 0.029144287109375, + "learning_rate": 1.238267486758117e-07, + "loss": 0.0221, + "reward": 0.9765184819698334, + "reward_std": 0.4072360023856163, + "rewards/accuracy_reward": 0.6224489733576775, + "rewards/improved_len_reward_dast": 0.3540695160627365, + "step": 349 + }, + { + "completion_length": 1897.44384765625, + "epoch": 0.9073233959818535, + "grad_norm": 0.20381019828760852, + "kl": 0.022857666015625, + "learning_rate": 1.2253243253327504e-07, + "loss": 0.0392, + "reward": 1.2360577583312988, + "reward_std": 0.4647463858127594, + "rewards/accuracy_reward": 0.7653061151504517, + "rewards/improved_len_reward_dast": 0.470751591026783, + "step": 350 + }, + { + "completion_length": 1563.9234313964844, + "epoch": 0.9099157485418017, + "grad_norm": 0.2149667100915999, + "kl": 0.01705169677734375, + "learning_rate": 1.212733573510154e-07, + "loss": 0.0251, + "reward": 1.484131395816803, + "reward_std": 0.3115840032696724, + "rewards/accuracy_reward": 0.867346927523613, + "rewards/improved_len_reward_dast": 0.6167844533920288, + "step": 351 + }, + { + "completion_length": 1613.438720703125, + "epoch": 0.9125081011017498, + "grad_norm": 0.2397808119710266, + "kl": 0.01849365234375, + "learning_rate": 1.20049626928764e-07, + "loss": 0.0255, + "reward": 1.374268501996994, + "reward_std": 0.3617161624133587, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.5579419583082199, + "step": 352 + }, + { + "completion_length": 1810.8724060058594, + "epoch": 0.915100453661698, + "grad_norm": 0.1952032672447838, + "kl": 0.0240478515625, + "learning_rate": 1.1886134215238539e-07, + "loss": 0.0013, + "reward": 1.2345272898674011, + "reward_std": 0.4293368086218834, + "rewards/accuracy_reward": 0.7602040618658066, + "rewards/improved_len_reward_dast": 0.47432321310043335, + "step": 353 + }, + { + "completion_length": 1323.6071319580078, + "epoch": 0.9176928062216462, + "grad_norm": 0.23544630425662993, + "kl": 0.0150299072265625, + "learning_rate": 1.1770860098556122e-07, + "loss": -0.0126, + "reward": 1.5638253688812256, + "reward_std": 0.3317151963710785, + "rewards/accuracy_reward": 0.9234693795442581, + "rewards/improved_len_reward_dast": 0.6403559893369675, + "step": 354 + }, + { + "completion_length": 1648.1122436523438, + "epoch": 0.9202851587815943, + "grad_norm": 0.19373617697957926, + "kl": 0.01983642578125, + "learning_rate": 1.1659149846171314e-07, + "loss": -0.0106, + "reward": 1.409626692533493, + "reward_std": 0.3634777031838894, + "rewards/accuracy_reward": 0.8112244606018066, + "rewards/improved_len_reward_dast": 0.5984021797776222, + "step": 355 + }, + { + "completion_length": 1640.484634399414, + "epoch": 0.9228775113415425, + "grad_norm": 0.2139648005259324, + "kl": 0.02065277099609375, + "learning_rate": 1.1551012667616889e-07, + "loss": -0.0041, + "reward": 1.3790205717086792, + "reward_std": 0.3004123643040657, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.5780001431703568, + "step": 356 + }, + { + "completion_length": 1952.6427612304688, + "epoch": 0.9254698639014906, + "grad_norm": 0.20207361431898127, + "kl": 0.027069091796875, + "learning_rate": 1.1446457477856933e-07, + "loss": 0.0274, + "reward": 1.1954913437366486, + "reward_std": 0.30133310705423355, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.450593464076519, + "step": 357 + }, + { + "completion_length": 1666.0816040039062, + "epoch": 0.9280622164614387, + "grad_norm": 0.2020263485504787, + "kl": 0.0185546875, + "learning_rate": 1.1345492896551908e-07, + "loss": -0.0157, + "reward": 1.4352505505084991, + "reward_std": 0.4688113033771515, + "rewards/accuracy_reward": 0.8928571343421936, + "rewards/improved_len_reward_dast": 0.542393408715725, + "step": 358 + }, + { + "completion_length": 1809.0611877441406, + "epoch": 0.9306545690213869, + "grad_norm": 0.2096938589768357, + "kl": 0.020904541015625, + "learning_rate": 1.1248127247348025e-07, + "loss": 0.0384, + "reward": 1.3605789840221405, + "reward_std": 0.35709768906235695, + "rewards/accuracy_reward": 0.8163264989852905, + "rewards/improved_len_reward_dast": 0.544252522289753, + "step": 359 + }, + { + "completion_length": 1797.744857788086, + "epoch": 0.933246921581335, + "grad_norm": 0.21622133027589538, + "kl": 0.02146148681640625, + "learning_rate": 1.1154368557191032e-07, + "loss": 0.0154, + "reward": 1.0935336202383041, + "reward_std": 0.3505462594330311, + "rewards/accuracy_reward": 0.6938775479793549, + "rewards/improved_len_reward_dast": 0.3996560573577881, + "step": 360 + }, + { + "completion_length": 1433.0765075683594, + "epoch": 0.9358392741412832, + "grad_norm": 0.22187489868295793, + "kl": 0.0160064697265625, + "learning_rate": 1.1064224555664489e-07, + "loss": -0.0178, + "reward": 1.2581793367862701, + "reward_std": 0.4055371508002281, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.4520568624138832, + "step": 361 + }, + { + "completion_length": 1678.2703857421875, + "epoch": 0.9384316267012314, + "grad_norm": 0.18769832722230134, + "kl": 0.0196075439453125, + "learning_rate": 1.0977702674352485e-07, + "loss": 0.0061, + "reward": 1.533081442117691, + "reward_std": 0.24393456988036633, + "rewards/accuracy_reward": 0.8673469126224518, + "rewards/improved_len_reward_dast": 0.6657344847917557, + "step": 362 + }, + { + "completion_length": 1496.3112030029297, + "epoch": 0.9410239792611795, + "grad_norm": 0.2409591218430649, + "kl": 0.01830291748046875, + "learning_rate": 1.0894810046227007e-07, + "loss": 0.0454, + "reward": 1.3800479769706726, + "reward_std": 0.3536526523530483, + "rewards/accuracy_reward": 0.8316326439380646, + "rewards/improved_len_reward_dast": 0.548415370285511, + "step": 363 + }, + { + "completion_length": 1296.9234313964844, + "epoch": 0.9436163318211277, + "grad_norm": 0.2065960957661233, + "kl": 0.014404296875, + "learning_rate": 1.0815553505059864e-07, + "loss": 0.0346, + "reward": 1.4174171388149261, + "reward_std": 0.3700226917862892, + "rewards/accuracy_reward": 0.8673469126224518, + "rewards/improved_len_reward_dast": 0.5500702187418938, + "step": 364 + }, + { + "completion_length": 1770.8111572265625, + "epoch": 0.9462086843810759, + "grad_norm": 0.22025176867987864, + "kl": 0.0205535888671875, + "learning_rate": 1.0739939584859327e-07, + "loss": 0.0372, + "reward": 1.2784855961799622, + "reward_std": 0.40080468729138374, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.4876692369580269, + "step": 365 + }, + { + "completion_length": 2252.9540405273438, + "epoch": 0.948801036941024, + "grad_norm": 0.25202994466231426, + "kl": 0.028900146484375, + "learning_rate": 1.066797451933144e-07, + "loss": 0.0538, + "reward": 1.052029862999916, + "reward_std": 0.4297824278473854, + "rewards/accuracy_reward": 0.6734693944454193, + "rewards/improved_len_reward_dast": 0.37856047973036766, + "step": 366 + }, + { + "completion_length": 1675.0867309570312, + "epoch": 0.9513933895009722, + "grad_norm": 0.18981437618840255, + "kl": 0.019775390625, + "learning_rate": 1.0599664241366108e-07, + "loss": 0.0215, + "reward": 1.4016070365905762, + "reward_std": 0.4491507261991501, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.5444641783833504, + "step": 367 + }, + { + "completion_length": 2051.3162536621094, + "epoch": 0.9539857420609202, + "grad_norm": 0.18988751309956323, + "kl": 0.0218658447265625, + "learning_rate": 1.0535014382547976e-07, + "loss": -0.0024, + "reward": 1.3321772515773773, + "reward_std": 0.5532524138689041, + "rewards/accuracy_reward": 0.8418367207050323, + "rewards/improved_len_reward_dast": 0.4903404861688614, + "step": 368 + }, + { + "completion_length": 1725.3927917480469, + "epoch": 0.9565780946208684, + "grad_norm": 0.26332331622328803, + "kl": 0.02056884765625, + "learning_rate": 1.0474030272692176e-07, + "loss": -0.0428, + "reward": 1.1207705438137054, + "reward_std": 0.582356795668602, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.33505629003047943, + "step": 369 + }, + { + "completion_length": 1730.3264465332031, + "epoch": 0.9591704471808166, + "grad_norm": 0.23147600575876767, + "kl": 0.020355224609375, + "learning_rate": 1.0416716939404906e-07, + "loss": 0.0207, + "reward": 1.4236516058444977, + "reward_std": 0.4436470791697502, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.5665087997913361, + "step": 370 + }, + { + "completion_length": 2078.234649658203, + "epoch": 0.9617627997407647, + "grad_norm": 0.18318392619509644, + "kl": 0.02490234375, + "learning_rate": 1.0363079107668965e-07, + "loss": 0.0174, + "reward": 1.2476365268230438, + "reward_std": 0.4425313174724579, + "rewards/accuracy_reward": 0.7704081535339355, + "rewards/improved_len_reward_dast": 0.4772283583879471, + "step": 371 + }, + { + "completion_length": 1901.7754821777344, + "epoch": 0.9643551523007129, + "grad_norm": 0.2045058157665467, + "kl": 0.0230865478515625, + "learning_rate": 1.03131211994542e-07, + "loss": 0.0151, + "reward": 1.1136702597141266, + "reward_std": 0.4208161160349846, + "rewards/accuracy_reward": 0.6989795863628387, + "rewards/improved_len_reward_dast": 0.41469068080186844, + "step": 372 + }, + { + "completion_length": 1673.6377563476562, + "epoch": 0.9669475048606611, + "grad_norm": 0.1953573582384899, + "kl": 0.0203399658203125, + "learning_rate": 1.0266847333352986e-07, + "loss": 0.0144, + "reward": 1.2215417325496674, + "reward_std": 0.3687748461961746, + "rewards/accuracy_reward": 0.8061224222183228, + "rewards/improved_len_reward_dast": 0.4154192693531513, + "step": 373 + }, + { + "completion_length": 1465.4744262695312, + "epoch": 0.9695398574206092, + "grad_norm": 0.2392315039852379, + "kl": 0.020263671875, + "learning_rate": 1.022426132424064e-07, + "loss": 0.0264, + "reward": 1.3526732623577118, + "reward_std": 0.2864141073077917, + "rewards/accuracy_reward": 0.8418367058038712, + "rewards/improved_len_reward_dast": 0.5108365193009377, + "step": 374 + }, + { + "completion_length": 1698.5611877441406, + "epoch": 0.9721322099805574, + "grad_norm": 0.22243506530923526, + "kl": 0.018157958984375, + "learning_rate": 1.0185366682960968e-07, + "loss": 0.0368, + "reward": 1.2421083450317383, + "reward_std": 0.3934044614434242, + "rewards/accuracy_reward": 0.7908163070678711, + "rewards/improved_len_reward_dast": 0.451292023062706, + "step": 375 + }, + { + "completion_length": 1694.5101623535156, + "epoch": 0.9747245625405055, + "grad_norm": 0.2049483563870167, + "kl": 0.02301025390625, + "learning_rate": 1.015016661603677e-07, + "loss": 0.0109, + "reward": 1.2675099819898605, + "reward_std": 0.27898336201906204, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.4613875336945057, + "step": 376 + }, + { + "completion_length": 1818.9183349609375, + "epoch": 0.9773169151004537, + "grad_norm": 0.2917301156280802, + "kl": 0.022247314453125, + "learning_rate": 1.011866402540555e-07, + "loss": 0.052, + "reward": 1.2979410141706467, + "reward_std": 0.4051199574023485, + "rewards/accuracy_reward": 0.8010203987360001, + "rewards/improved_len_reward_dast": 0.4969206303358078, + "step": 377 + }, + { + "completion_length": 1676.4030151367188, + "epoch": 0.9799092676604018, + "grad_norm": 0.19999847167358073, + "kl": 0.0189666748046875, + "learning_rate": 1.0090861508180229e-07, + "loss": 0.0173, + "reward": 1.307900682091713, + "reward_std": 0.36051470041275024, + "rewards/accuracy_reward": 0.806122437119484, + "rewards/improved_len_reward_dast": 0.5017782524228096, + "step": 378 + }, + { + "completion_length": 1303.3468933105469, + "epoch": 0.9825016202203499, + "grad_norm": 0.23002851272315084, + "kl": 0.016387939453125, + "learning_rate": 1.006676135643506e-07, + "loss": 0.0223, + "reward": 1.5040651261806488, + "reward_std": 0.28981203213334084, + "rewards/accuracy_reward": 0.8877550810575485, + "rewards/improved_len_reward_dast": 0.6163100153207779, + "step": 379 + }, + { + "completion_length": 1699.98974609375, + "epoch": 0.9850939727802981, + "grad_norm": 0.2773167363062717, + "kl": 0.021759033203125, + "learning_rate": 1.004636555701666e-07, + "loss": -0.0024, + "reward": 1.3300544768571854, + "reward_std": 0.4332263544201851, + "rewards/accuracy_reward": 0.857142835855484, + "rewards/improved_len_reward_dast": 0.47291168570518494, + "step": 380 + }, + { + "completion_length": 2158.5560607910156, + "epoch": 0.9876863253402463, + "grad_norm": 0.19893298725270195, + "kl": 0.027099609375, + "learning_rate": 1.0029675791380211e-07, + "loss": 0.0245, + "reward": 1.366698831319809, + "reward_std": 0.3425176590681076, + "rewards/accuracy_reward": 0.8112244755029678, + "rewards/improved_len_reward_dast": 0.5554743856191635, + "step": 381 + }, + { + "completion_length": 1771.0765075683594, + "epoch": 0.9902786779001944, + "grad_norm": 0.21454331685840108, + "kl": 0.025909423828125, + "learning_rate": 1.0016693435450846e-07, + "loss": 0.0522, + "reward": 1.1434401869773865, + "reward_std": 0.518133670091629, + "rewards/accuracy_reward": 0.7448979467153549, + "rewards/improved_len_reward_dast": 0.39854224771261215, + "step": 382 + }, + { + "completion_length": 1916.8673095703125, + "epoch": 0.9928710304601426, + "grad_norm": 0.21868762838968606, + "kl": 0.0216217041015625, + "learning_rate": 1.00074195595102e-07, + "loss": 0.0149, + "reward": 1.2855271100997925, + "reward_std": 0.4449741840362549, + "rewards/accuracy_reward": 0.7857142686843872, + "rewards/improved_len_reward_dast": 0.4998128265142441, + "step": 383 + }, + { + "completion_length": 1359.0254821777344, + "epoch": 0.9954633830200907, + "grad_norm": 0.22146763439588837, + "kl": 0.01685333251953125, + "learning_rate": 1.0001854928108199e-07, + "loss": -0.0267, + "reward": 1.3678375780582428, + "reward_std": 0.3422878012061119, + "rewards/accuracy_reward": 0.8214285671710968, + "rewards/improved_len_reward_dast": 0.5464089959859848, + "step": 384 + }, + { + "completion_length": 1564.7193908691406, + "epoch": 0.9980557355800389, + "grad_norm": 0.29725903676415294, + "kl": 0.019683837890625, + "learning_rate": 1e-07, + "loss": 0.0597, + "reward": 1.2890927195549011, + "reward_std": 0.3781392499804497, + "rewards/accuracy_reward": 0.795918345451355, + "rewards/improved_len_reward_dast": 0.49317440390586853, + "step": 385 + }, + { + "epoch": 0.9980557355800389, + "step": 385, + "total_flos": 0.0, + "train_loss": 0.0015093988140246698, + "train_runtime": 5817.5821, + "train_samples_per_second": 1.856, + "train_steps_per_second": 0.066 + } + ], + "logging_steps": 1, + "max_steps": 385, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +}