{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9980557355800389, "eval_steps": 500, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1848.5458984375, "epoch": 0.002592352559948153, "grad_norm": 0.15412024450495956, "kl": 0.0, "learning_rate": 2.564102564102564e-08, "loss": 0.0246, "reward": 1.4397025108337402, "reward_std": 0.4701927825808525, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.5978657752275467, "step": 1 }, { "completion_length": 2130.4540100097656, "epoch": 0.005184705119896306, "grad_norm": 0.19408049978062328, "kl": 0.0, "learning_rate": 5.128205128205128e-08, "loss": 0.0596, "reward": 1.0504228472709656, "reward_std": 0.31693385541439056, "rewards/accuracy_reward": 0.6938775479793549, "rewards/improved_len_reward_dast": 0.3565452881157398, "step": 2 }, { "completion_length": 2034.2958679199219, "epoch": 0.007777057679844459, "grad_norm": 0.1531077683543166, "kl": 0.0001348257064819336, "learning_rate": 7.692307692307692e-08, "loss": -0.0129, "reward": 1.0101122856140137, "reward_std": 0.4455054961144924, "rewards/accuracy_reward": 0.6581632494926453, "rewards/improved_len_reward_dast": 0.3519490174949169, "step": 3 }, { "completion_length": 2119.744903564453, "epoch": 0.010369410239792612, "grad_norm": 0.1349622041652031, "kl": 0.00012981891632080078, "learning_rate": 1.0256410256410256e-07, "loss": -0.0044, "reward": 1.2723601460456848, "reward_std": 0.4871401861310005, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.46623772382736206, "step": 4 }, { "completion_length": 1834.7652893066406, "epoch": 0.012961762799740765, "grad_norm": 0.16434839601505108, "kl": 0.00012123584747314453, "learning_rate": 1.2820512820512818e-07, "loss": 0.0443, "reward": 1.267708569765091, "reward_std": 0.3166223168373108, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.5024024695158005, "step": 5 }, { "completion_length": 2152.540740966797, "epoch": 0.015554115359688918, "grad_norm": 0.15696438577129812, "kl": 0.00012969970703125, "learning_rate": 1.5384615384615385e-07, "loss": -0.0129, "reward": 1.0658827871084213, "reward_std": 0.4334075152873993, "rewards/accuracy_reward": 0.7142857164144516, "rewards/improved_len_reward_dast": 0.35159702971577644, "step": 6 }, { "completion_length": 1747.4591674804688, "epoch": 0.01814646791963707, "grad_norm": 0.15893508336342455, "kl": 0.00010186433792114258, "learning_rate": 1.7948717948717948e-07, "loss": 0.0429, "reward": 1.1448375135660172, "reward_std": 0.37509680539369583, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.3846333734691143, "step": 7 }, { "completion_length": 1834.0611572265625, "epoch": 0.020738820479585224, "grad_norm": 0.1573166657366275, "kl": 0.00011396408081054688, "learning_rate": 2.0512820512820512e-07, "loss": 0.0036, "reward": 1.272167608141899, "reward_std": 0.3015933446586132, "rewards/accuracy_reward": 0.8010203838348389, "rewards/improved_len_reward_dast": 0.47114718705415726, "step": 8 }, { "completion_length": 2077.1122131347656, "epoch": 0.023331173039533377, "grad_norm": 0.15123878128380125, "kl": 0.0001251697540283203, "learning_rate": 2.3076923076923078e-07, "loss": 0.0025, "reward": 1.1346809566020966, "reward_std": 0.44101474434137344, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.38978295773267746, "step": 9 }, { "completion_length": 2001.6989135742188, "epoch": 0.02592352559948153, "grad_norm": 0.15946517595083978, "kl": 0.00013494491577148438, "learning_rate": 2.5641025641025636e-07, "loss": 0.0414, "reward": 1.0840217173099518, "reward_std": 0.37720372527837753, "rewards/accuracy_reward": 0.7244897931814194, "rewards/improved_len_reward_dast": 0.3595319651067257, "step": 10 }, { "completion_length": 2258.3468322753906, "epoch": 0.028515878159429683, "grad_norm": 0.16258661653616813, "kl": 0.0001423358917236328, "learning_rate": 2.8205128205128203e-07, "loss": -0.0035, "reward": 1.035923331975937, "reward_std": 0.44437722116708755, "rewards/accuracy_reward": 0.6989795863628387, "rewards/improved_len_reward_dast": 0.33694368600845337, "step": 11 }, { "completion_length": 2071.6019897460938, "epoch": 0.031108230719377836, "grad_norm": 0.15520698307030686, "kl": 0.0001367330551147461, "learning_rate": 3.076923076923077e-07, "loss": 0.0151, "reward": 1.1415546834468842, "reward_std": 0.37767674773931503, "rewards/accuracy_reward": 0.7653061002492905, "rewards/improved_len_reward_dast": 0.3762484937906265, "step": 12 }, { "completion_length": 1976.1530151367188, "epoch": 0.033700583279325985, "grad_norm": 0.17189810461087038, "kl": 0.00012564659118652344, "learning_rate": 3.333333333333333e-07, "loss": 0.0019, "reward": 1.125291794538498, "reward_std": 0.4003720059990883, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.36508774384856224, "step": 13 }, { "completion_length": 2114.5612182617188, "epoch": 0.03629293583927414, "grad_norm": 0.18307106761606742, "kl": 0.00011533498764038086, "learning_rate": 3.5897435897435896e-07, "loss": 0.0248, "reward": 1.0526445508003235, "reward_std": 0.33728349953889847, "rewards/accuracy_reward": 0.6530612111091614, "rewards/improved_len_reward_dast": 0.3995833285152912, "step": 14 }, { "completion_length": 1440.3571166992188, "epoch": 0.03888528839922229, "grad_norm": 0.19219239961861387, "kl": 7.677078247070312e-05, "learning_rate": 3.8461538461538463e-07, "loss": 0.0411, "reward": 1.3660516738891602, "reward_std": 0.2804589569568634, "rewards/accuracy_reward": 0.9030611962080002, "rewards/improved_len_reward_dast": 0.46299050748348236, "step": 15 }, { "completion_length": 1305.2295684814453, "epoch": 0.04147764095917045, "grad_norm": 0.18960595204343547, "kl": 9.632110595703125e-05, "learning_rate": 4.1025641025641024e-07, "loss": 0.0021, "reward": 1.418413519859314, "reward_std": 0.44618362933397293, "rewards/accuracy_reward": 0.9132652878761292, "rewards/improved_len_reward_dast": 0.5051482394337654, "step": 16 }, { "completion_length": 1996.841796875, "epoch": 0.0440699935191186, "grad_norm": 0.16908596036858053, "kl": 0.00011074542999267578, "learning_rate": 4.358974358974359e-07, "loss": 0.0341, "reward": 1.1314191222190857, "reward_std": 0.6118374243378639, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.37121502310037613, "step": 17 }, { "completion_length": 1431.4846801757812, "epoch": 0.046662346079066754, "grad_norm": 0.22735446925703126, "kl": 8.571147918701172e-05, "learning_rate": 4.6153846153846156e-07, "loss": 0.0407, "reward": 1.206202208995819, "reward_std": 0.3719758912920952, "rewards/accuracy_reward": 0.8469387739896774, "rewards/improved_len_reward_dast": 0.3592635001987219, "step": 18 }, { "completion_length": 1709.688720703125, "epoch": 0.0492546986390149, "grad_norm": 0.18577878700500422, "kl": 0.00010377168655395508, "learning_rate": 4.871794871794871e-07, "loss": 0.0417, "reward": 1.1775241941213608, "reward_std": 0.5288017690181732, "rewards/accuracy_reward": 0.7806122303009033, "rewards/improved_len_reward_dast": 0.39691203087568283, "step": 19 }, { "completion_length": 1838.2754821777344, "epoch": 0.05184705119896306, "grad_norm": 0.16046849749418657, "kl": 0.00011777877807617188, "learning_rate": 5.128205128205127e-07, "loss": 0.0208, "reward": 1.1064813733100891, "reward_std": 0.5807419717311859, "rewards/accuracy_reward": 0.7551020234823227, "rewards/improved_len_reward_dast": 0.3513793312013149, "step": 20 }, { "completion_length": 2217.14794921875, "epoch": 0.05443940375891121, "grad_norm": 0.1963426577198746, "kl": 0.00014448165893554688, "learning_rate": 5.384615384615384e-07, "loss": 0.0467, "reward": 1.0558834075927734, "reward_std": 0.558340422809124, "rewards/accuracy_reward": 0.6887754797935486, "rewards/improved_len_reward_dast": 0.36710788309574127, "step": 21 }, { "completion_length": 1927.3316040039062, "epoch": 0.057031756318859365, "grad_norm": 0.18525325793381328, "kl": 9.930133819580078e-05, "learning_rate": 5.641025641025641e-07, "loss": 0.0242, "reward": 1.1790167838335037, "reward_std": 0.4690204933285713, "rewards/accuracy_reward": 0.7857142835855484, "rewards/improved_len_reward_dast": 0.39330248534679413, "step": 22 }, { "completion_length": 1841.6938171386719, "epoch": 0.059624108878807515, "grad_norm": 0.17253945143916685, "kl": 0.00010156631469726562, "learning_rate": 5.897435897435898e-07, "loss": 0.0724, "reward": 1.3324860334396362, "reward_std": 0.28684910759329796, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.5314656794071198, "step": 23 }, { "completion_length": 1679.9642333984375, "epoch": 0.06221646143875567, "grad_norm": 0.20870673606371046, "kl": 0.00012958049774169922, "learning_rate": 6.153846153846154e-07, "loss": 0.0467, "reward": 1.1419631987810135, "reward_std": 0.38880112022161484, "rewards/accuracy_reward": 0.8010203838348389, "rewards/improved_len_reward_dast": 0.3409428298473358, "step": 24 }, { "completion_length": 2278.8673095703125, "epoch": 0.06480881399870382, "grad_norm": 0.15316366458717245, "kl": 0.00015485286712646484, "learning_rate": 6.410256410256411e-07, "loss": 0.0203, "reward": 0.9916537553071976, "reward_std": 0.43884778022766113, "rewards/accuracy_reward": 0.6479591578245163, "rewards/improved_len_reward_dast": 0.3436945825815201, "step": 25 }, { "completion_length": 1853.4744873046875, "epoch": 0.06740116655865197, "grad_norm": 0.1623211083206233, "kl": 0.0001201629638671875, "learning_rate": 6.666666666666666e-07, "loss": 0.054, "reward": 1.1868394315242767, "reward_std": 0.4521937184035778, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.42663537338376045, "step": 26 }, { "completion_length": 1726.6427917480469, "epoch": 0.06999351911860013, "grad_norm": 0.21873628771810408, "kl": 0.0001125335693359375, "learning_rate": 6.923076923076922e-07, "loss": 0.086, "reward": 1.2924230992794037, "reward_std": 0.41079702973365784, "rewards/accuracy_reward": 0.8418367356061935, "rewards/improved_len_reward_dast": 0.45058638602495193, "step": 27 }, { "completion_length": 1667.6071166992188, "epoch": 0.07258587167854828, "grad_norm": 0.18905776966101132, "kl": 0.00011527538299560547, "learning_rate": 7.179487179487179e-07, "loss": 0.045, "reward": 1.2638164162635803, "reward_std": 0.2763877250254154, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.4525919631123543, "step": 28 }, { "completion_length": 2032.4132080078125, "epoch": 0.07517822423849643, "grad_norm": 0.15326481666027458, "kl": 0.00012993812561035156, "learning_rate": 7.435897435897435e-07, "loss": 0.0002, "reward": 1.1888954937458038, "reward_std": 0.41189244389533997, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.42869146168231964, "step": 29 }, { "completion_length": 1764.4999389648438, "epoch": 0.07777057679844458, "grad_norm": 0.13723640714210214, "kl": 9.167194366455078e-05, "learning_rate": 7.692307692307693e-07, "loss": -0.0066, "reward": 1.0674456059932709, "reward_std": 0.4443123862147331, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.2970374431461096, "step": 30 }, { "completion_length": 2198.729522705078, "epoch": 0.08036292935839275, "grad_norm": 0.15079546325320037, "kl": 0.0001614093780517578, "learning_rate": 7.948717948717948e-07, "loss": 0.013, "reward": 1.3089748322963715, "reward_std": 0.5274734199047089, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.48754626512527466, "step": 31 }, { "completion_length": 1879.6376647949219, "epoch": 0.0829552819183409, "grad_norm": 0.18155740478939822, "kl": 0.0001251697540283203, "learning_rate": 8.205128205128205e-07, "loss": 0.0131, "reward": 1.0791111141443253, "reward_std": 0.46941038966178894, "rewards/accuracy_reward": 0.7346938699483871, "rewards/improved_len_reward_dast": 0.34441729076206684, "step": 32 }, { "completion_length": 1981.6274719238281, "epoch": 0.08554763447828904, "grad_norm": 0.1572483646834791, "kl": 0.0001424551010131836, "learning_rate": 8.461538461538461e-07, "loss": 0.0476, "reward": 1.3903695046901703, "reward_std": 0.4975530132651329, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.5332267209887505, "step": 33 }, { "completion_length": 2061.9999389648438, "epoch": 0.0881399870382372, "grad_norm": 0.1901994694040778, "kl": 0.0001537799835205078, "learning_rate": 8.717948717948718e-07, "loss": 0.0481, "reward": 1.1052793562412262, "reward_std": 0.4630768448114395, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.36038143932819366, "step": 34 }, { "completion_length": 2465.1224060058594, "epoch": 0.09073233959818536, "grad_norm": 0.15096654762075654, "kl": 0.0001761913299560547, "learning_rate": 8.974358974358974e-07, "loss": 0.0009, "reward": 0.7364223003387451, "reward_std": 0.4229283332824707, "rewards/accuracy_reward": 0.5357142835855484, "rewards/improved_len_reward_dast": 0.20070804562419653, "step": 35 }, { "completion_length": 2199.688720703125, "epoch": 0.09332469215813351, "grad_norm": 0.1791438585472734, "kl": 0.0001895427703857422, "learning_rate": 9.230769230769231e-07, "loss": 0.0399, "reward": 1.2042141258716583, "reward_std": 0.3516070544719696, "rewards/accuracy_reward": 0.7755101770162582, "rewards/improved_len_reward_dast": 0.4287039190530777, "step": 36 }, { "completion_length": 2019.6478881835938, "epoch": 0.09591704471808166, "grad_norm": 0.1921872688604767, "kl": 0.00020241737365722656, "learning_rate": 9.487179487179486e-07, "loss": 0.0187, "reward": 1.3608680367469788, "reward_std": 0.4326165243983269, "rewards/accuracy_reward": 0.8316326439380646, "rewards/improved_len_reward_dast": 0.5292353481054306, "step": 37 }, { "completion_length": 1693.0, "epoch": 0.0985093972780298, "grad_norm": 0.19045468187511366, "kl": 0.0001348257064819336, "learning_rate": 9.743589743589742e-07, "loss": 0.0464, "reward": 1.3455627113580704, "reward_std": 0.3586850240826607, "rewards/accuracy_reward": 0.846938744187355, "rewards/improved_len_reward_dast": 0.49862393736839294, "step": 38 }, { "completion_length": 2374.637725830078, "epoch": 0.10110174983797797, "grad_norm": 0.13494398794899917, "kl": 0.0002028942108154297, "learning_rate": 1e-06, "loss": 0.0272, "reward": 0.8414318859577179, "reward_std": 0.48852086812257767, "rewards/accuracy_reward": 0.6224489659070969, "rewards/improved_len_reward_dast": 0.21898294147104025, "step": 39 }, { "completion_length": 2517.3162841796875, "epoch": 0.10369410239792612, "grad_norm": 0.16744933736297124, "kl": 0.0002105236053466797, "learning_rate": 9.99981450718918e-07, "loss": 0.0616, "reward": 0.9213714599609375, "reward_std": 0.43374133110046387, "rewards/accuracy_reward": 0.6275510042905807, "rewards/improved_len_reward_dast": 0.2938204384408891, "step": 40 }, { "completion_length": 1807.0203857421875, "epoch": 0.10628645495787427, "grad_norm": 0.15669439739322064, "kl": 0.0002703666687011719, "learning_rate": 9.99925804404898e-07, "loss": 0.0228, "reward": 0.994490772485733, "reward_std": 0.5202224850654602, "rewards/accuracy_reward": 0.7193877547979355, "rewards/improved_len_reward_dast": 0.27510301768779755, "step": 41 }, { "completion_length": 1907.0305786132812, "epoch": 0.10887880751782242, "grad_norm": 0.1507066292700219, "kl": 0.00019288063049316406, "learning_rate": 9.998330656454915e-07, "loss": 0.0566, "reward": 1.3084075152873993, "reward_std": 0.3637009263038635, "rewards/accuracy_reward": 0.8367346823215485, "rewards/improved_len_reward_dast": 0.4716728553175926, "step": 42 }, { "completion_length": 1946.2958984375, "epoch": 0.11147116007777058, "grad_norm": 0.21826053334493506, "kl": 0.0002913475036621094, "learning_rate": 9.99703242086198e-07, "loss": 0.0894, "reward": 1.0715700536966324, "reward_std": 0.4503963589668274, "rewards/accuracy_reward": 0.7397958934307098, "rewards/improved_len_reward_dast": 0.3317741868086159, "step": 43 }, { "completion_length": 1862.9591674804688, "epoch": 0.11406351263771873, "grad_norm": 0.18297677442826724, "kl": 0.000263214111328125, "learning_rate": 9.995363444298333e-07, "loss": 0.037, "reward": 1.2490134239196777, "reward_std": 0.4328879788517952, "rewards/accuracy_reward": 0.7653061076998711, "rewards/improved_len_reward_dast": 0.4837072864174843, "step": 44 }, { "completion_length": 2316.530517578125, "epoch": 0.11665586519766688, "grad_norm": 0.15141936649503004, "kl": 0.0003380775451660156, "learning_rate": 9.993323864356492e-07, "loss": 0.0182, "reward": 0.7743872255086899, "reward_std": 0.55930295586586, "rewards/accuracy_reward": 0.5765305981040001, "rewards/improved_len_reward_dast": 0.19785663951188326, "step": 45 }, { "completion_length": 2924.1683349609375, "epoch": 0.11924821775761503, "grad_norm": 0.12614913947783052, "kl": 0.0002567768096923828, "learning_rate": 9.990913849181977e-07, "loss": 0.0096, "reward": 0.8433035537600517, "reward_std": 0.41744476184248924, "rewards/accuracy_reward": 0.5561224333941936, "rewards/improved_len_reward_dast": 0.28718107007443905, "step": 46 }, { "completion_length": 1805.5203552246094, "epoch": 0.1218405703175632, "grad_norm": 0.15881163011201838, "kl": 0.0007009506225585938, "learning_rate": 9.988133597459444e-07, "loss": 0.0175, "reward": 1.1679251790046692, "reward_std": 0.4487800747156143, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.3720068037509918, "step": 47 }, { "completion_length": 1873.7499389648438, "epoch": 0.12443292287751134, "grad_norm": 0.1713187626068608, "kl": 0.00028634071350097656, "learning_rate": 9.984983338396323e-07, "loss": 0.0488, "reward": 1.2101139575242996, "reward_std": 0.33226554840803146, "rewards/accuracy_reward": 0.760204091668129, "rewards/improved_len_reward_dast": 0.44990991055965424, "step": 48 }, { "completion_length": 1411.4234161376953, "epoch": 0.1270252754374595, "grad_norm": 0.18215178056260903, "kl": 0.0005662441253662109, "learning_rate": 9.981463331703903e-07, "loss": 0.0348, "reward": 1.4565084278583527, "reward_std": 0.3240164965391159, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.5891614705324173, "step": 49 }, { "completion_length": 1923.6836547851562, "epoch": 0.12961762799740764, "grad_norm": 0.21182741369137464, "kl": 0.00043964385986328125, "learning_rate": 9.977573867575937e-07, "loss": 0.0483, "reward": 1.0672244429588318, "reward_std": 0.42784378305077553, "rewards/accuracy_reward": 0.7244897782802582, "rewards/improved_len_reward_dast": 0.342734657227993, "step": 50 }, { "completion_length": 2293.10205078125, "epoch": 0.1322099805573558, "grad_norm": 0.17784622321620705, "kl": 0.0005965232849121094, "learning_rate": 9.9733152666647e-07, "loss": 0.0011, "reward": 1.119166985154152, "reward_std": 0.4692757725715637, "rewards/accuracy_reward": 0.6836734563112259, "rewards/improved_len_reward_dast": 0.43549349159002304, "step": 51 }, { "completion_length": 2606.8468627929688, "epoch": 0.13480233311730394, "grad_norm": 0.16188767449887392, "kl": 0.0004382133483886719, "learning_rate": 9.968687880054579e-07, "loss": 0.0355, "reward": 1.0624671429395676, "reward_std": 0.5272083953022957, "rewards/accuracy_reward": 0.6530612111091614, "rewards/improved_len_reward_dast": 0.4094058535993099, "step": 52 }, { "completion_length": 1741.494873046875, "epoch": 0.1373946856772521, "grad_norm": 0.18163262147540796, "kl": 0.0007987022399902344, "learning_rate": 9.963692089233104e-07, "loss": 0.0189, "reward": 1.1586879789829254, "reward_std": 0.3523149788379669, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.3678716644644737, "step": 53 }, { "completion_length": 1731.5713806152344, "epoch": 0.13998703823720027, "grad_norm": 0.17545616003222686, "kl": 0.000713348388671875, "learning_rate": 9.958328306059508e-07, "loss": 0.0163, "reward": 1.087464839220047, "reward_std": 0.37970298528671265, "rewards/accuracy_reward": 0.7499999701976776, "rewards/improved_len_reward_dast": 0.3374648429453373, "step": 54 }, { "completion_length": 1940.2244262695312, "epoch": 0.1425793907971484, "grad_norm": 0.20829916863603212, "kl": 0.0008840560913085938, "learning_rate": 9.952596972730782e-07, "loss": 0.0418, "reward": 1.136895164847374, "reward_std": 0.21965472772717476, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.37158904783427715, "step": 55 }, { "completion_length": 2024.3825988769531, "epoch": 0.14517174335709657, "grad_norm": 0.16061899047482414, "kl": 0.0006990432739257812, "learning_rate": 9.946498561745201e-07, "loss": 0.0061, "reward": 1.3091870546340942, "reward_std": 0.42107394337654114, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.50816660374403, "step": 56 }, { "completion_length": 1990.7856750488281, "epoch": 0.14776409591704473, "grad_norm": 0.17205784813401187, "kl": 0.0008096694946289062, "learning_rate": 9.94003357586339e-07, "loss": 0.0362, "reward": 1.3399446904659271, "reward_std": 0.34059275686740875, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.5185160860419273, "step": 57 }, { "completion_length": 2279.331573486328, "epoch": 0.15035644847699287, "grad_norm": 0.1637215457597632, "kl": 0.0006699562072753906, "learning_rate": 9.933202548066855e-07, "loss": 0.0424, "reward": 1.0715169459581375, "reward_std": 0.39220181107521057, "rewards/accuracy_reward": 0.6887754946947098, "rewards/improved_len_reward_dast": 0.38274142518639565, "step": 58 }, { "completion_length": 2313.2499084472656, "epoch": 0.15294880103694103, "grad_norm": 0.16376379786761341, "kl": 0.00083160400390625, "learning_rate": 9.926006041514068e-07, "loss": 0.0178, "reward": 1.142714947462082, "reward_std": 0.3937602676451206, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.40291906148195267, "step": 59 }, { "completion_length": 2046.1631774902344, "epoch": 0.15554115359688916, "grad_norm": 0.23236942157628335, "kl": 0.0009450912475585938, "learning_rate": 9.918444649494012e-07, "loss": 0.0662, "reward": 1.245220124721527, "reward_std": 0.2695602234452963, "rewards/accuracy_reward": 0.7755101770162582, "rewards/improved_len_reward_dast": 0.46970994770526886, "step": 60 }, { "completion_length": 2175.6224060058594, "epoch": 0.15813350615683733, "grad_norm": 0.15376927864805173, "kl": 0.0009765625, "learning_rate": 9.9105189953773e-07, "loss": 0.0196, "reward": 1.2470524311065674, "reward_std": 0.45635347813367844, "rewards/accuracy_reward": 0.7653061002492905, "rewards/improved_len_reward_dast": 0.48174627125263214, "step": 61 }, { "completion_length": 2337.1581115722656, "epoch": 0.1607258587167855, "grad_norm": 0.15218316765828901, "kl": 0.0008411407470703125, "learning_rate": 9.90222973256475e-07, "loss": 0.0249, "reward": 1.37412428855896, "reward_std": 0.39829079806804657, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.552695706486702, "step": 62 }, { "completion_length": 2680.4183349609375, "epoch": 0.16331821127673363, "grad_norm": 0.21218309711028285, "kl": 0.0010118484497070312, "learning_rate": 9.89357754443355e-07, "loss": 0.0529, "reward": 0.8223338723182678, "reward_std": 0.4073232337832451, "rewards/accuracy_reward": 0.5510203987360001, "rewards/improved_len_reward_dast": 0.2713134288787842, "step": 63 }, { "completion_length": 2635.7550048828125, "epoch": 0.1659105638366818, "grad_norm": 0.1620590183136494, "kl": 0.000949859619140625, "learning_rate": 9.884563144280897e-07, "loss": 0.0464, "reward": 1.0863047987222672, "reward_std": 0.4714929535984993, "rewards/accuracy_reward": 0.678571417927742, "rewards/improved_len_reward_dast": 0.40773337706923485, "step": 64 }, { "completion_length": 1972.2907104492188, "epoch": 0.16850291639662995, "grad_norm": 0.17935605548712222, "kl": 0.001079559326171875, "learning_rate": 9.875187275265198e-07, "loss": 0.0255, "reward": 1.2364896833896637, "reward_std": 0.4289153516292572, "rewards/accuracy_reward": 0.7959183603525162, "rewards/improved_len_reward_dast": 0.44057128578424454, "step": 65 }, { "completion_length": 2525.2091064453125, "epoch": 0.1710952689565781, "grad_norm": 0.14682421707314297, "kl": 0.0012102127075195312, "learning_rate": 9.865450710344807e-07, "loss": 0.0344, "reward": 0.8753379732370377, "reward_std": 0.3238606099039316, "rewards/accuracy_reward": 0.5918367132544518, "rewards/improved_len_reward_dast": 0.2835012301802635, "step": 66 }, { "completion_length": 2308.1478576660156, "epoch": 0.17368762151652625, "grad_norm": 0.17311806443951758, "kl": 0.001552581787109375, "learning_rate": 9.855354252214307e-07, "loss": 0.0564, "reward": 1.152388408780098, "reward_std": 0.4479888826608658, "rewards/accuracy_reward": 0.7653061002492905, "rewards/improved_len_reward_dast": 0.3870823085308075, "step": 67 }, { "completion_length": 1699.9540405273438, "epoch": 0.1762799740764744, "grad_norm": 0.18795647394996712, "kl": 0.0012683868408203125, "learning_rate": 9.844898733238311e-07, "loss": 0.0538, "reward": 1.4352277517318726, "reward_std": 0.30926575139164925, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.5678808689117432, "step": 68 }, { "completion_length": 1942.3876953125, "epoch": 0.17887232663642255, "grad_norm": 0.2210659776524768, "kl": 0.0016345977783203125, "learning_rate": 9.83408501538287e-07, "loss": -0.0183, "reward": 1.0560709834098816, "reward_std": 0.44945112615823746, "rewards/accuracy_reward": 0.7346938699483871, "rewards/improved_len_reward_dast": 0.32137710228562355, "step": 69 }, { "completion_length": 1671.9642639160156, "epoch": 0.18146467919637072, "grad_norm": 0.19750773670302219, "kl": 0.0015382766723632812, "learning_rate": 9.822913990144387e-07, "loss": 0.0167, "reward": 1.1308622658252716, "reward_std": 0.4337487518787384, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.2737194411456585, "step": 70 }, { "completion_length": 2116.3571166992188, "epoch": 0.18405703175631885, "grad_norm": 0.1778004806410334, "kl": 0.00168609619140625, "learning_rate": 9.811386578476146e-07, "loss": 0.0029, "reward": 1.2179836481809616, "reward_std": 0.46442168205976486, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.457779623568058, "step": 71 }, { "completion_length": 1906.9795532226562, "epoch": 0.18664938431626701, "grad_norm": 0.1986625505084921, "kl": 0.001316070556640625, "learning_rate": 9.79950373071236e-07, "loss": 0.0285, "reward": 1.1908049881458282, "reward_std": 0.3781607896089554, "rewards/accuracy_reward": 0.7244897931814194, "rewards/improved_len_reward_dast": 0.4663151800632477, "step": 72 }, { "completion_length": 1938.2652587890625, "epoch": 0.18924173687621518, "grad_norm": 0.178605084347928, "kl": 0.001659393310546875, "learning_rate": 9.787266426489845e-07, "loss": 0.0145, "reward": 1.233821153640747, "reward_std": 0.40631671994924545, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.46341295540332794, "step": 73 }, { "completion_length": 2097.5152587890625, "epoch": 0.1918340894361633, "grad_norm": 0.21993776817198404, "kl": 0.0017414093017578125, "learning_rate": 9.77467567466725e-07, "loss": 0.0586, "reward": 1.0030385106801987, "reward_std": 0.48096026852726936, "rewards/accuracy_reward": 0.6989795863628387, "rewards/improved_len_reward_dast": 0.30405890196561813, "step": 74 }, { "completion_length": 2267.7193298339844, "epoch": 0.19442644199611148, "grad_norm": 0.25966079935566605, "kl": 0.002155303955078125, "learning_rate": 9.761732513241882e-07, "loss": 0.1164, "reward": 1.1867494583129883, "reward_std": 0.36580438911914825, "rewards/accuracy_reward": 0.7346938699483871, "rewards/improved_len_reward_dast": 0.45205555111169815, "step": 75 }, { "completion_length": 1932.4285278320312, "epoch": 0.1970187945560596, "grad_norm": 0.18810468542751257, "kl": 0.0028076171875, "learning_rate": 9.748438009264142e-07, "loss": 0.0311, "reward": 1.302773892879486, "reward_std": 0.3699945732951164, "rewards/accuracy_reward": 0.8265306055545807, "rewards/improved_len_reward_dast": 0.4762432426214218, "step": 76 }, { "completion_length": 2192.2601928710938, "epoch": 0.19961114711600778, "grad_norm": 0.1818517530996337, "kl": 0.002178192138671875, "learning_rate": 9.734793258749538e-07, "loss": 0.0556, "reward": 1.2119455933570862, "reward_std": 0.33562129363417625, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.4517414830625057, "step": 77 }, { "completion_length": 2217.4693298339844, "epoch": 0.20220349967595594, "grad_norm": 0.17001135134898285, "kl": 0.002323150634765625, "learning_rate": 9.720799386588358e-07, "loss": 0.0214, "reward": 1.0081346929073334, "reward_std": 0.5323201268911362, "rewards/accuracy_reward": 0.6938775479793549, "rewards/improved_len_reward_dast": 0.3142571374773979, "step": 78 }, { "completion_length": 2039.5867309570312, "epoch": 0.20479585223590407, "grad_norm": 0.19848985839460778, "kl": 0.002605438232421875, "learning_rate": 9.706457546452898e-07, "loss": 0.0507, "reward": 1.1386294960975647, "reward_std": 0.3946889452636242, "rewards/accuracy_reward": 0.7448979541659355, "rewards/improved_len_reward_dast": 0.3937314935028553, "step": 79 }, { "completion_length": 2590.5305786132812, "epoch": 0.20738820479585224, "grad_norm": 0.15129066062202914, "kl": 0.002803802490234375, "learning_rate": 9.691768920702379e-07, "loss": -0.0267, "reward": 0.8391379117965698, "reward_std": 0.39438748359680176, "rewards/accuracy_reward": 0.5765306055545807, "rewards/improved_len_reward_dast": 0.26260728016495705, "step": 80 }, { "completion_length": 2176.096893310547, "epoch": 0.2099805573558004, "grad_norm": 0.18394525455650038, "kl": 0.00240325927734375, "learning_rate": 9.676734720285456e-07, "loss": 0.0667, "reward": 1.148956298828125, "reward_std": 0.34060123562812805, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.4040583297610283, "step": 81 }, { "completion_length": 2104.994842529297, "epoch": 0.21257290991574854, "grad_norm": 0.1783774193001553, "kl": 0.00263214111328125, "learning_rate": 9.661356184640394e-07, "loss": 0.0607, "reward": 1.300699919462204, "reward_std": 0.29261183738708496, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.5149856060743332, "step": 82 }, { "completion_length": 2017.9591674804688, "epoch": 0.2151652624756967, "grad_norm": 0.20548002392363018, "kl": 0.003589630126953125, "learning_rate": 9.64563458159288e-07, "loss": 0.0372, "reward": 1.2817473858594894, "reward_std": 0.42862868309020996, "rewards/accuracy_reward": 0.8265305906534195, "rewards/improved_len_reward_dast": 0.45521679520606995, "step": 83 }, { "completion_length": 2365.132568359375, "epoch": 0.21775761503564484, "grad_norm": 0.2118006180262065, "kl": 0.003673553466796875, "learning_rate": 9.629571207251515e-07, "loss": 0.0474, "reward": 1.1858174800872803, "reward_std": 0.42872869968414307, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.4256134256720543, "step": 84 }, { "completion_length": 2227.8111572265625, "epoch": 0.220349967595593, "grad_norm": 0.1730257242071835, "kl": 0.0032958984375, "learning_rate": 9.613167385900944e-07, "loss": 0.0116, "reward": 0.9865487962961197, "reward_std": 0.30924591794610023, "rewards/accuracy_reward": 0.6887754946947098, "rewards/improved_len_reward_dast": 0.2977732727304101, "step": 85 }, { "completion_length": 2069.8213806152344, "epoch": 0.22294232015554116, "grad_norm": 0.1997054811852766, "kl": 0.003353118896484375, "learning_rate": 9.59642446989269e-07, "loss": 0.0275, "reward": 1.2090528905391693, "reward_std": 0.4271962344646454, "rewards/accuracy_reward": 0.7806122303009033, "rewards/improved_len_reward_dast": 0.428440660238266, "step": 86 }, { "completion_length": 2234.255096435547, "epoch": 0.2255346727154893, "grad_norm": 0.1689278406473576, "kl": 0.0041046142578125, "learning_rate": 9.579343839533668e-07, "loss": 0.0395, "reward": 1.1342998147010803, "reward_std": 0.3173440955579281, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.3945038840174675, "step": 87 }, { "completion_length": 2258.3009643554688, "epoch": 0.22812702527543746, "grad_norm": 0.19449538540190586, "kl": 0.004421234130859375, "learning_rate": 9.561926902972378e-07, "loss": 0.0785, "reward": 1.2548484802246094, "reward_std": 0.3709937259554863, "rewards/accuracy_reward": 0.7755101770162582, "rewards/improved_len_reward_dast": 0.47933831810951233, "step": 88 }, { "completion_length": 1870.6989440917969, "epoch": 0.23071937783538563, "grad_norm": 0.1864398126735164, "kl": 0.0042266845703125, "learning_rate": 9.544175096082838e-07, "loss": 0.0646, "reward": 1.4300118386745453, "reward_std": 0.4286029487848282, "rewards/accuracy_reward": 0.8928571343421936, "rewards/improved_len_reward_dast": 0.5371547788381577, "step": 89 }, { "completion_length": 2082.653045654297, "epoch": 0.23331173039533376, "grad_norm": 0.17766778571294792, "kl": 0.00475311279296875, "learning_rate": 9.526089882346172e-07, "loss": 0.032, "reward": 1.1855316758155823, "reward_std": 0.36463288590312004, "rewards/accuracy_reward": 0.7551020085811615, "rewards/improved_len_reward_dast": 0.4304296597838402, "step": 90 }, { "completion_length": 2117.2244262695312, "epoch": 0.23590408295528192, "grad_norm": 0.19874233088672905, "kl": 0.003894805908203125, "learning_rate": 9.507672752730001e-07, "loss": 0.052, "reward": 1.0779342502355576, "reward_std": 0.45030639320611954, "rewards/accuracy_reward": 0.734693855047226, "rewards/improved_len_reward_dast": 0.3432403616607189, "step": 91 }, { "completion_length": 2126.6173095703125, "epoch": 0.23849643551523006, "grad_norm": 0.20706633281686568, "kl": 0.004180908203125, "learning_rate": 9.4889252255655e-07, "loss": 0.0681, "reward": 1.1621150970458984, "reward_std": 0.2173718847334385, "rewards/accuracy_reward": 0.7295918315649033, "rewards/improved_len_reward_dast": 0.43252328783273697, "step": 92 }, { "completion_length": 2107.4692993164062, "epoch": 0.24108878807517822, "grad_norm": 0.18999527082233988, "kl": 0.00507354736328125, "learning_rate": 9.469848846422223e-07, "loss": 0.0305, "reward": 0.9012731686234474, "reward_std": 0.2958849798887968, "rewards/accuracy_reward": 0.6326530501246452, "rewards/improved_len_reward_dast": 0.2686200775206089, "step": 93 }, { "completion_length": 2329.5662841796875, "epoch": 0.2436811406351264, "grad_norm": 0.17793830796024995, "kl": 0.004726409912109375, "learning_rate": 9.450445187980699e-07, "loss": 0.0053, "reward": 1.0069625079631805, "reward_std": 0.4442039094865322, "rewards/accuracy_reward": 0.663265272974968, "rewards/improved_len_reward_dast": 0.3436972051858902, "step": 94 }, { "completion_length": 2371.1223754882812, "epoch": 0.24627349319507452, "grad_norm": 0.16551461901403783, "kl": 0.00560760498046875, "learning_rate": 9.430715849902774e-07, "loss": 0.0161, "reward": 1.1833973824977875, "reward_std": 0.3829594776034355, "rewards/accuracy_reward": 0.7551020309329033, "rewards/improved_len_reward_dast": 0.4282953441143036, "step": 95 }, { "completion_length": 1950.9897766113281, "epoch": 0.24886584575502269, "grad_norm": 0.22225719247681372, "kl": 0.004608154296875, "learning_rate": 9.410662458699723e-07, "loss": 0.0456, "reward": 1.138383835554123, "reward_std": 0.32722293585538864, "rewards/accuracy_reward": 0.7142857015132904, "rewards/improved_len_reward_dast": 0.4240981712937355, "step": 96 }, { "completion_length": 1459.1683349609375, "epoch": 0.25145819831497085, "grad_norm": 0.20670520181853694, "kl": 0.00476837158203125, "learning_rate": 9.390286667598169e-07, "loss": 0.0546, "reward": 1.3123253285884857, "reward_std": 0.31760613806545734, "rewards/accuracy_reward": 0.846938744187355, "rewards/improved_len_reward_dast": 0.4653865396976471, "step": 97 }, { "completion_length": 1836.9029846191406, "epoch": 0.254050550874919, "grad_norm": 0.20386220038181252, "kl": 0.00446319580078125, "learning_rate": 9.369590156403784e-07, "loss": 0.0339, "reward": 1.3093420267105103, "reward_std": 0.42256173491477966, "rewards/accuracy_reward": 0.8163265138864517, "rewards/improved_len_reward_dast": 0.49301546812057495, "step": 98 }, { "completion_length": 1921.7550354003906, "epoch": 0.2566429034348671, "grad_norm": 0.22385072499443348, "kl": 0.00586700439453125, "learning_rate": 9.348574631362808e-07, "loss": 0.0254, "reward": 1.369395136833191, "reward_std": 0.292521633207798, "rewards/accuracy_reward": 0.8367346823215485, "rewards/improved_len_reward_dast": 0.5326604098081589, "step": 99 }, { "completion_length": 1589.2550659179688, "epoch": 0.2592352559948153, "grad_norm": 0.23062182502361955, "kl": 0.003963470458984375, "learning_rate": 9.327241825021379e-07, "loss": 0.0939, "reward": 1.398920476436615, "reward_std": 0.34097858518362045, "rewards/accuracy_reward": 0.8979591578245163, "rewards/improved_len_reward_dast": 0.5009612441062927, "step": 100 }, { "completion_length": 1968.3979187011719, "epoch": 0.26182760855476345, "grad_norm": 0.19172453408443837, "kl": 0.0052337646484375, "learning_rate": 9.3055934960827e-07, "loss": 0.033, "reward": 1.2349633574485779, "reward_std": 0.4557712897658348, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.46455518156290054, "step": 101 }, { "completion_length": 2024.6580810546875, "epoch": 0.2644199611147116, "grad_norm": 0.18835419471758258, "kl": 0.00595855712890625, "learning_rate": 9.283631429262053e-07, "loss": -0.0018, "reward": 1.237942174077034, "reward_std": 0.4386955201625824, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.4522278979420662, "step": 102 }, { "completion_length": 2042.0101623535156, "epoch": 0.2670123136746598, "grad_norm": 0.16797444756904736, "kl": 0.00687408447265625, "learning_rate": 9.261357435139665e-07, "loss": 0.0127, "reward": 1.147979348897934, "reward_std": 0.39860222302377224, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.3877752497792244, "step": 103 }, { "completion_length": 1771.6785278320312, "epoch": 0.2696046662346079, "grad_norm": 0.19397130084636785, "kl": 0.00556182861328125, "learning_rate": 9.238773350011437e-07, "loss": 0.0329, "reward": 1.3575038313865662, "reward_std": 0.28452699072659016, "rewards/accuracy_reward": 0.8418367356061935, "rewards/improved_len_reward_dast": 0.5156671032309532, "step": 104 }, { "completion_length": 1984.2295532226562, "epoch": 0.27219701879455604, "grad_norm": 0.20491481745891912, "kl": 0.00533294677734375, "learning_rate": 9.215881035737557e-07, "loss": 0.0756, "reward": 1.3917469382286072, "reward_std": 0.3919885456562042, "rewards/accuracy_reward": 0.8673469126224518, "rewards/improved_len_reward_dast": 0.5244000777602196, "step": 105 }, { "completion_length": 2123.3570861816406, "epoch": 0.2747893713545042, "grad_norm": 0.19107859298960242, "kl": 0.00609588623046875, "learning_rate": 9.192682379589017e-07, "loss": 0.0343, "reward": 1.3419382572174072, "reward_std": 0.550883948802948, "rewards/accuracy_reward": 0.8163265287876129, "rewards/improved_len_reward_dast": 0.5256116688251495, "step": 106 }, { "completion_length": 2321.183563232422, "epoch": 0.27738172391445237, "grad_norm": 0.17417279176148165, "kl": 0.00618743896484375, "learning_rate": 9.169179294092006e-07, "loss": 0.037, "reward": 1.2553168833255768, "reward_std": 0.3132058009505272, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.49001070857048035, "step": 107 }, { "completion_length": 1755.6121826171875, "epoch": 0.27997407647440054, "grad_norm": 0.1910812285243796, "kl": 0.0055389404296875, "learning_rate": 9.145373716870257e-07, "loss": 0.0074, "reward": 1.1911440938711166, "reward_std": 0.47732261940836906, "rewards/accuracy_reward": 0.8265305906534195, "rewards/improved_len_reward_dast": 0.36461350694298744, "step": 108 }, { "completion_length": 2498.53564453125, "epoch": 0.2825664290343487, "grad_norm": 0.1847398357059974, "kl": 0.0076904296875, "learning_rate": 9.121267610485294e-07, "loss": 0.0136, "reward": 1.0379046350717545, "reward_std": 0.5191724747419357, "rewards/accuracy_reward": 0.6734693795442581, "rewards/improved_len_reward_dast": 0.36443524062633514, "step": 109 }, { "completion_length": 1881.5408020019531, "epoch": 0.2851587815942968, "grad_norm": 0.1895141382280174, "kl": 0.0063629150390625, "learning_rate": 9.096862962274642e-07, "loss": -0.0114, "reward": 1.2222436666488647, "reward_std": 0.2921589985489845, "rewards/accuracy_reward": 0.760204054415226, "rewards/improved_len_reward_dast": 0.4620395749807358, "step": 110 }, { "completion_length": 2229.341827392578, "epoch": 0.28775113415424497, "grad_norm": 0.16533064618080134, "kl": 0.00737762451171875, "learning_rate": 9.072161784187988e-07, "loss": 0.029, "reward": 1.213012382388115, "reward_std": 0.427090298384428, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.41709401085972786, "step": 111 }, { "completion_length": 1740.8673400878906, "epoch": 0.29034348671419313, "grad_norm": 0.17704874550004857, "kl": 0.00606536865234375, "learning_rate": 9.047166112621312e-07, "loss": 0.0232, "reward": 1.3144700229167938, "reward_std": 0.3366679251194, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.4981435164809227, "step": 112 }, { "completion_length": 2048.397918701172, "epoch": 0.2929358392741413, "grad_norm": 0.19568646749424262, "kl": 0.00690460205078125, "learning_rate": 9.021878008249001e-07, "loss": 0.0206, "reward": 1.1744825094938278, "reward_std": 0.479649193584919, "rewards/accuracy_reward": 0.7806122303009033, "rewards/improved_len_reward_dast": 0.3938702493906021, "step": 113 }, { "completion_length": 1883.0255126953125, "epoch": 0.29552819183408946, "grad_norm": 0.201863471118327, "kl": 0.007293701171875, "learning_rate": 8.996299555853973e-07, "loss": 0.0263, "reward": 1.3593637347221375, "reward_std": 0.3963543549180031, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.5175270512700081, "step": 114 }, { "completion_length": 1779.4489135742188, "epoch": 0.29812054439403757, "grad_norm": 0.21073286141952957, "kl": 0.00705718994140625, "learning_rate": 8.970432864155798e-07, "loss": 0.059, "reward": 1.284899353981018, "reward_std": 0.3950739651918411, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.49408305436372757, "step": 115 }, { "completion_length": 1918.2244873046875, "epoch": 0.30071289695398573, "grad_norm": 0.19227538961602422, "kl": 0.00742340087890625, "learning_rate": 8.944280065636851e-07, "loss": 0.0454, "reward": 1.2475728243589401, "reward_std": 0.32171259075403214, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.4618585482239723, "step": 116 }, { "completion_length": 1858.4795532226562, "epoch": 0.3033052495139339, "grad_norm": 0.19238271005304078, "kl": 0.00749969482421875, "learning_rate": 8.917843316366515e-07, "loss": 0.0387, "reward": 1.364868402481079, "reward_std": 0.2818027026951313, "rewards/accuracy_reward": 0.8316326439380646, "rewards/improved_len_reward_dast": 0.533235713839531, "step": 117 }, { "completion_length": 1993.6224060058594, "epoch": 0.30589760207388206, "grad_norm": 0.231864346111992, "kl": 0.00769805908203125, "learning_rate": 8.891124795823426e-07, "loss": -0.0075, "reward": 1.1190623342990875, "reward_std": 0.2991497367620468, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.3282460141927004, "step": 118 }, { "completion_length": 1985.5509643554688, "epoch": 0.3084899546338302, "grad_norm": 0.17623896225871394, "kl": 0.00771331787109375, "learning_rate": 8.864126706715796e-07, "loss": 0.0186, "reward": 1.2160087823867798, "reward_std": 0.35445018485188484, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.4711107425391674, "step": 119 }, { "completion_length": 2125.1376953125, "epoch": 0.31108230719377833, "grad_norm": 0.2263640313290784, "kl": 0.0087432861328125, "learning_rate": 8.83685127479982e-07, "loss": 0.0941, "reward": 1.281501442193985, "reward_std": 0.38218285515904427, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.5110933035612106, "step": 120 }, { "completion_length": 1814.5611877441406, "epoch": 0.3136746597537265, "grad_norm": 0.19715675281839773, "kl": 0.007568359375, "learning_rate": 8.809300748696173e-07, "loss": 0.0386, "reward": 1.1133249253034592, "reward_std": 0.3796735033392906, "rewards/accuracy_reward": 0.7295918315649033, "rewards/improved_len_reward_dast": 0.38373304158449173, "step": 121 }, { "completion_length": 2427.4489135742188, "epoch": 0.31626701231367466, "grad_norm": 0.16760355775672944, "kl": 0.00905609130859375, "learning_rate": 8.781477399704652e-07, "loss": 0.0048, "reward": 1.0130163729190826, "reward_std": 0.4051677845418453, "rewards/accuracy_reward": 0.6632652878761292, "rewards/improved_len_reward_dast": 0.349751066416502, "step": 122 }, { "completion_length": 2251.3570861816406, "epoch": 0.3188593648736228, "grad_norm": 0.1882544168870131, "kl": 0.00846099853515625, "learning_rate": 8.753383521616902e-07, "loss": 0.0008, "reward": 1.1944599151611328, "reward_std": 0.4080551564693451, "rewards/accuracy_reward": 0.7499999850988388, "rewards/improved_len_reward_dast": 0.4444599226117134, "step": 123 }, { "completion_length": 1852.142822265625, "epoch": 0.321451717433571, "grad_norm": 0.22567456549295617, "kl": 0.007122039794921875, "learning_rate": 8.72502143052733e-07, "loss": 0.0421, "reward": 1.0371171534061432, "reward_std": 0.4070936441421509, "rewards/accuracy_reward": 0.6887754946947098, "rewards/improved_len_reward_dast": 0.34834159165620804, "step": 124 }, { "completion_length": 1902.4897766113281, "epoch": 0.32404406999351915, "grad_norm": 0.18976500768952323, "kl": 0.00728607177734375, "learning_rate": 8.696393464642158e-07, "loss": -0.0168, "reward": 1.379349261522293, "reward_std": 0.34975893795490265, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.5324105769395828, "step": 125 }, { "completion_length": 1687.3979187011719, "epoch": 0.32663642255346725, "grad_norm": 0.1842833719422884, "kl": 0.00609588623046875, "learning_rate": 8.667501984086655e-07, "loss": 0.0248, "reward": 1.3401367366313934, "reward_std": 0.26001402735710144, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.5544224381446838, "step": 126 }, { "completion_length": 1719.23974609375, "epoch": 0.3292287751134154, "grad_norm": 0.2122526031093734, "kl": 0.00665283203125, "learning_rate": 8.638349370710573e-07, "loss": 0.0493, "reward": 1.2587095499038696, "reward_std": 0.30533889308571815, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.4423830099403858, "step": 127 }, { "completion_length": 1702.78564453125, "epoch": 0.3318211276733636, "grad_norm": 0.18811783070011717, "kl": 0.00623321533203125, "learning_rate": 8.608938027891775e-07, "loss": 0.0049, "reward": 1.3044427931308746, "reward_std": 0.47574885934591293, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.49832039326429367, "step": 128 }, { "completion_length": 1589.6376953125, "epoch": 0.33441348023331174, "grad_norm": 0.2122723729405287, "kl": 0.007274627685546875, "learning_rate": 8.579270380338107e-07, "loss": 0.0378, "reward": 1.3573221862316132, "reward_std": 0.40166376531124115, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.510383352637291, "step": 129 }, { "completion_length": 2209.2244873046875, "epoch": 0.3370058327932599, "grad_norm": 0.18766107651382932, "kl": 0.0082550048828125, "learning_rate": 8.549348873887496e-07, "loss": -0.035, "reward": 0.9989715814590454, "reward_std": 0.4630734659731388, "rewards/accuracy_reward": 0.6734693646430969, "rewards/improved_len_reward_dast": 0.32550226897001266, "step": 130 }, { "completion_length": 1750.2499694824219, "epoch": 0.339598185353208, "grad_norm": 0.26668844455154506, "kl": 0.0062713623046875, "learning_rate": 8.519175975306312e-07, "loss": 0.0733, "reward": 1.0193718448281288, "reward_std": 0.49021392315626144, "rewards/accuracy_reward": 0.6989795863628387, "rewards/improved_len_reward_dast": 0.3203922025859356, "step": 131 }, { "completion_length": 1834.892822265625, "epoch": 0.3421905379131562, "grad_norm": 0.17123158557193757, "kl": 0.006275177001953125, "learning_rate": 8.48875417208601e-07, "loss": 0.0191, "reward": 1.2724904865026474, "reward_std": 0.36864253878593445, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.5020823329687119, "step": 132 }, { "completion_length": 1844.9081115722656, "epoch": 0.34478289047310434, "grad_norm": 0.1744110793812119, "kl": 0.00693511962890625, "learning_rate": 8.458085972238048e-07, "loss": 0.0332, "reward": 1.0728662610054016, "reward_std": 0.4644254148006439, "rewards/accuracy_reward": 0.7499999850988388, "rewards/improved_len_reward_dast": 0.3228662498295307, "step": 133 }, { "completion_length": 1910.1427917480469, "epoch": 0.3473752430330525, "grad_norm": 0.22282630764089068, "kl": 0.0084686279296875, "learning_rate": 8.427173904087138e-07, "loss": 0.0291, "reward": 1.1172972619533539, "reward_std": 0.3814988359808922, "rewards/accuracy_reward": 0.7551020085811615, "rewards/improved_len_reward_dast": 0.36219523288309574, "step": 134 }, { "completion_length": 2461.3775329589844, "epoch": 0.34996759559300067, "grad_norm": 0.1595488734110434, "kl": 0.0104522705078125, "learning_rate": 8.396020516062794e-07, "loss": -0.0068, "reward": 0.9715078249573708, "reward_std": 0.3740999586880207, "rewards/accuracy_reward": 0.6173469200730324, "rewards/improved_len_reward_dast": 0.3541608899831772, "step": 135 }, { "completion_length": 1467.096908569336, "epoch": 0.3525599481529488, "grad_norm": 0.17905275908990426, "kl": 0.005458831787109375, "learning_rate": 8.364628376489242e-07, "loss": 0.0333, "reward": 1.558873325586319, "reward_std": 0.29448162391781807, "rewards/accuracy_reward": 0.928571417927742, "rewards/improved_len_reward_dast": 0.6303019374608994, "step": 136 }, { "completion_length": 1310.5, "epoch": 0.35515230071289694, "grad_norm": 0.20951329036509847, "kl": 0.0060577392578125, "learning_rate": 8.333000073373685e-07, "loss": -0.0166, "reward": 1.2859368920326233, "reward_std": 0.3338315784931183, "rewards/accuracy_reward": 0.8061224520206451, "rewards/improved_len_reward_dast": 0.47981445118784904, "step": 137 }, { "completion_length": 1815.6122436523438, "epoch": 0.3577446532728451, "grad_norm": 0.19604752185803775, "kl": 0.0070953369140625, "learning_rate": 8.301138214192945e-07, "loss": 0.0433, "reward": 1.2342120856046677, "reward_std": 0.4501468688249588, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.4331916607916355, "step": 138 }, { "completion_length": 1862.0764770507812, "epoch": 0.36033700583279327, "grad_norm": 0.18709921475186367, "kl": 0.0084228515625, "learning_rate": 8.269045425678497e-07, "loss": -0.011, "reward": 1.2167351096868515, "reward_std": 0.3770736940205097, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.45653103291988373, "step": 139 }, { "completion_length": 1736.1376953125, "epoch": 0.36292935839274143, "grad_norm": 0.19354018571685683, "kl": 0.0071258544921875, "learning_rate": 8.236724353599918e-07, "loss": 0.041, "reward": 1.496632605791092, "reward_std": 0.3335278294980526, "rewards/accuracy_reward": 0.8979591578245163, "rewards/improved_len_reward_dast": 0.5986734926700592, "step": 140 }, { "completion_length": 1628.4183654785156, "epoch": 0.36552171095268954, "grad_norm": 0.16803171468726585, "kl": 0.00705718994140625, "learning_rate": 8.204177662546763e-07, "loss": -0.0198, "reward": 1.2802585661411285, "reward_std": 0.3480174820870161, "rewards/accuracy_reward": 0.8163265138864517, "rewards/improved_len_reward_dast": 0.46393200755119324, "step": 141 }, { "completion_length": 1563.2244567871094, "epoch": 0.3681140635126377, "grad_norm": 0.21830948983629073, "kl": 0.006256103515625, "learning_rate": 8.171408035708906e-07, "loss": 0.0147, "reward": 1.477361023426056, "reward_std": 0.36876992136240005, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.6151161342859268, "step": 142 }, { "completion_length": 1426.9744567871094, "epoch": 0.37070641607258586, "grad_norm": 0.1829469047156503, "kl": 0.005870819091796875, "learning_rate": 8.138418174655323e-07, "loss": -0.0128, "reward": 1.475436508655548, "reward_std": 0.28024090081453323, "rewards/accuracy_reward": 0.8877550959587097, "rewards/improved_len_reward_dast": 0.5876814350485802, "step": 143 }, { "completion_length": 2269.73974609375, "epoch": 0.37329876863253403, "grad_norm": 0.15370768982629232, "kl": 0.00823974609375, "learning_rate": 8.105210799111366e-07, "loss": 0.029, "reward": 1.0333527326583862, "reward_std": 0.4238397367298603, "rewards/accuracy_reward": 0.6632652878761292, "rewards/improved_len_reward_dast": 0.37008739449083805, "step": 144 }, { "completion_length": 1661.2142333984375, "epoch": 0.3758911211924822, "grad_norm": 0.1756144937263373, "kl": 0.006439208984375, "learning_rate": 8.071788646734564e-07, "loss": 0.0278, "reward": 1.297868698835373, "reward_std": 0.30791742727160454, "rewards/accuracy_reward": 0.8163265138864517, "rewards/improved_len_reward_dast": 0.4815421551465988, "step": 145 }, { "completion_length": 1629.2754516601562, "epoch": 0.37848347375243035, "grad_norm": 0.19753853796416515, "kl": 0.006805419921875, "learning_rate": 8.038154472888909e-07, "loss": -0.0047, "reward": 1.2643596529960632, "reward_std": 0.403556901961565, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.45823724940419197, "step": 146 }, { "completion_length": 1698.1785278320312, "epoch": 0.38107582631237846, "grad_norm": 0.18090958864036752, "kl": 0.00759124755859375, "learning_rate": 8.004311050417711e-07, "loss": -0.0063, "reward": 1.2380123734474182, "reward_std": 0.39292842149734497, "rewards/accuracy_reward": 0.7806122153997421, "rewards/improved_len_reward_dast": 0.4574001543223858, "step": 147 }, { "completion_length": 1603.7703704833984, "epoch": 0.3836681788723266, "grad_norm": 0.1689548990240542, "kl": 0.00655364990234375, "learning_rate": 7.970261169414999e-07, "loss": 0.0034, "reward": 1.2632354497909546, "reward_std": 0.42876998893916607, "rewards/accuracy_reward": 0.8010203838348389, "rewards/improved_len_reward_dast": 0.46221502870321274, "step": 148 }, { "completion_length": 2111.928497314453, "epoch": 0.3862605314322748, "grad_norm": 0.23403462014206552, "kl": 0.00902557373046875, "learning_rate": 7.936007636995497e-07, "loss": 0.0581, "reward": 1.1535758823156357, "reward_std": 0.33541079610586166, "rewards/accuracy_reward": 0.7091836556792259, "rewards/improved_len_reward_dast": 0.44439224898815155, "step": 149 }, { "completion_length": 1584.5560760498047, "epoch": 0.38885288399222295, "grad_norm": 0.19966714442908384, "kl": 0.00608062744140625, "learning_rate": 7.901553277063213e-07, "loss": -0.0136, "reward": 1.0925945341587067, "reward_std": 0.4660287909209728, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.3323905020952225, "step": 150 }, { "completion_length": 1963.030502319336, "epoch": 0.3914452365521711, "grad_norm": 0.17996728024183786, "kl": 0.0086822509765625, "learning_rate": 7.866900930078618e-07, "loss": 0.0058, "reward": 1.245696559548378, "reward_std": 0.4446266293525696, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.4854924902319908, "step": 151 }, { "completion_length": 1893.0254821777344, "epoch": 0.3940375891121192, "grad_norm": 0.16735022993158205, "kl": 0.007110595703125, "learning_rate": 7.832053452824489e-07, "loss": 0.0104, "reward": 1.2418105602264404, "reward_std": 0.4090575650334358, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.4714023545384407, "step": 152 }, { "completion_length": 1724.3111572265625, "epoch": 0.3966299416720674, "grad_norm": 0.1864010620729168, "kl": 0.00872802734375, "learning_rate": 7.797013718170384e-07, "loss": 0.0296, "reward": 1.1897482573986053, "reward_std": 0.3867075741291046, "rewards/accuracy_reward": 0.7755101919174194, "rewards/improved_len_reward_dast": 0.4142380841076374, "step": 153 }, { "completion_length": 1520.3673553466797, "epoch": 0.39922229423201555, "grad_norm": 0.19558753420229233, "kl": 0.006317138671875, "learning_rate": 7.761784614835801e-07, "loss": -0.0009, "reward": 1.1826948821544647, "reward_std": 0.44549785554409027, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.3969806134700775, "step": 154 }, { "completion_length": 1902.83154296875, "epoch": 0.4018146467919637, "grad_norm": 0.1628442801355898, "kl": 0.007907867431640625, "learning_rate": 7.726369047152029e-07, "loss": 0.0111, "reward": 1.1829434633255005, "reward_std": 0.4352233223617077, "rewards/accuracy_reward": 0.7346938699483871, "rewards/improved_len_reward_dast": 0.44824954867362976, "step": 155 }, { "completion_length": 1687.5867004394531, "epoch": 0.4044069993519119, "grad_norm": 0.15254799874290897, "kl": 0.0055694580078125, "learning_rate": 7.690769934822712e-07, "loss": 0.0209, "reward": 1.3427188694477081, "reward_std": 0.39824075251817703, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.5212903171777725, "step": 156 }, { "completion_length": 1699.2857055664062, "epoch": 0.40699935191186, "grad_norm": 0.17162045711276386, "kl": 0.00756072998046875, "learning_rate": 7.654990212683142e-07, "loss": 0.0029, "reward": 1.3672717213630676, "reward_std": 0.34800875186920166, "rewards/accuracy_reward": 0.8520407974720001, "rewards/improved_len_reward_dast": 0.5152308940887451, "step": 157 }, { "completion_length": 1642.4897766113281, "epoch": 0.40959170447180815, "grad_norm": 0.17781118941038052, "kl": 0.0069427490234375, "learning_rate": 7.619032830458307e-07, "loss": 0.0238, "reward": 1.36138716340065, "reward_std": 0.42799485474824905, "rewards/accuracy_reward": 0.8520407974720001, "rewards/improved_len_reward_dast": 0.5093463957309723, "step": 158 }, { "completion_length": 2058.10205078125, "epoch": 0.4121840570317563, "grad_norm": 0.21486100887413462, "kl": 0.00844573974609375, "learning_rate": 7.582900752519723e-07, "loss": 0.052, "reward": 1.2367046475410461, "reward_std": 0.4686100408434868, "rewards/accuracy_reward": 0.7857142835855484, "rewards/improved_len_reward_dast": 0.45099035650491714, "step": 159 }, { "completion_length": 2116.7601928710938, "epoch": 0.4147764095917045, "grad_norm": 0.21872883985010524, "kl": 0.00928497314453125, "learning_rate": 7.546596957641031e-07, "loss": 0.0469, "reward": 1.1451009958982468, "reward_std": 0.2814931422472, "rewards/accuracy_reward": 0.7244897782802582, "rewards/improved_len_reward_dast": 0.4206111915409565, "step": 160 }, { "completion_length": 2057.1172790527344, "epoch": 0.41736876215165264, "grad_norm": 0.223277485058984, "kl": 0.0099639892578125, "learning_rate": 7.510124438752432e-07, "loss": 0.0282, "reward": 1.2358856201171875, "reward_std": 0.42381204664707184, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.4501713886857033, "step": 161 }, { "completion_length": 1648.7907409667969, "epoch": 0.4199611147116008, "grad_norm": 0.19361427922643096, "kl": 0.007965087890625, "learning_rate": 7.473486202693949e-07, "loss": 0.0283, "reward": 1.5626276433467865, "reward_std": 0.33783891052007675, "rewards/accuracy_reward": 0.9081632643938065, "rewards/improved_len_reward_dast": 0.6544643938541412, "step": 162 }, { "completion_length": 1720.7805938720703, "epoch": 0.4225534672715489, "grad_norm": 0.22042630118078563, "kl": 0.008636474609375, "learning_rate": 7.43668526996753e-07, "loss": 0.0517, "reward": 1.203346148133278, "reward_std": 0.48596539348363876, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.43293796479701996, "step": 163 }, { "completion_length": 1918.5816345214844, "epoch": 0.4251458198314971, "grad_norm": 0.20825217508460148, "kl": 0.0105438232421875, "learning_rate": 7.399724674488046e-07, "loss": 0.0313, "reward": 1.2619640827178955, "reward_std": 0.3394176550209522, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.49665799736976624, "step": 164 }, { "completion_length": 1879.0867004394531, "epoch": 0.42773817239144524, "grad_norm": 0.20859456410748778, "kl": 0.00949859619140625, "learning_rate": 7.36260746333316e-07, "loss": 0.1032, "reward": 1.250516802072525, "reward_std": 0.21495914831757545, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.48521073907613754, "step": 165 }, { "completion_length": 1788.2040405273438, "epoch": 0.4303305249513934, "grad_norm": 0.19365279193672524, "kl": 0.00925445556640625, "learning_rate": 7.325336696492128e-07, "loss": 0.031, "reward": 1.3934488892555237, "reward_std": 0.3679058402776718, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.5261020287871361, "step": 166 }, { "completion_length": 2040.7346801757812, "epoch": 0.43292287751134156, "grad_norm": 0.1746728685861396, "kl": 0.010894775390625, "learning_rate": 7.287915446613531e-07, "loss": 0.0021, "reward": 1.270061433315277, "reward_std": 0.3740099295973778, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.46393903344869614, "step": 167 }, { "completion_length": 2118.234649658203, "epoch": 0.43551523007128967, "grad_norm": 0.20129074148639173, "kl": 0.013275146484375, "learning_rate": 7.250346798751953e-07, "loss": 0.006, "reward": 0.9839373528957367, "reward_std": 0.581517793238163, "rewards/accuracy_reward": 0.6785714030265808, "rewards/improved_len_reward_dast": 0.3053659498691559, "step": 168 }, { "completion_length": 1795.9540252685547, "epoch": 0.43810758263123784, "grad_norm": 0.1813953032982878, "kl": 0.009395599365234375, "learning_rate": 7.212633850113662e-07, "loss": 0.0235, "reward": 1.178409919142723, "reward_std": 0.4242382049560547, "rewards/accuracy_reward": 0.734693855047226, "rewards/improved_len_reward_dast": 0.44371599704027176, "step": 169 }, { "completion_length": 1421.1734619140625, "epoch": 0.440699935191186, "grad_norm": 0.18794137958282095, "kl": 0.008941650390625, "learning_rate": 7.174779709801253e-07, "loss": 0.0159, "reward": 1.4234746396541595, "reward_std": 0.32885606586933136, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.5612297654151917, "step": 170 }, { "completion_length": 1736.6632690429688, "epoch": 0.44329228775113416, "grad_norm": 0.22796049151575712, "kl": 0.009891510009765625, "learning_rate": 7.136787498557344e-07, "loss": 0.0088, "reward": 1.3514071702957153, "reward_std": 0.40995020419359207, "rewards/accuracy_reward": 0.846938744187355, "rewards/improved_len_reward_dast": 0.5044683739542961, "step": 171 }, { "completion_length": 1768.7193603515625, "epoch": 0.4458846403110823, "grad_norm": 0.25032479837006205, "kl": 0.010284423828125, "learning_rate": 7.098660348507293e-07, "loss": 0.0732, "reward": 1.269765853881836, "reward_std": 0.46360351890325546, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.4993576854467392, "step": 172 }, { "completion_length": 1956.9999694824219, "epoch": 0.44847699287103043, "grad_norm": 0.17507117871432235, "kl": 0.0093231201171875, "learning_rate": 7.060401402900977e-07, "loss": 0.0185, "reward": 1.1613440364599228, "reward_std": 0.5052430480718613, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.42154809460043907, "step": 173 }, { "completion_length": 1834.2601623535156, "epoch": 0.4510693454309786, "grad_norm": 0.19217203672529928, "kl": 0.01007843017578125, "learning_rate": 7.022013815853672e-07, "loss": 0.0209, "reward": 1.0959883034229279, "reward_std": 0.47629018872976303, "rewards/accuracy_reward": 0.7295918166637421, "rewards/improved_len_reward_dast": 0.3663964793086052, "step": 174 }, { "completion_length": 1817.4489440917969, "epoch": 0.45366169799092676, "grad_norm": 0.19322905501288215, "kl": 0.01153564453125, "learning_rate": 6.983500752086006e-07, "loss": 0.0448, "reward": 1.2833284437656403, "reward_std": 0.43457718193531036, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.4874100536108017, "step": 175 }, { "completion_length": 1651.7244873046875, "epoch": 0.4562540505508749, "grad_norm": 0.19443121591302054, "kl": 0.00969696044921875, "learning_rate": 6.94486538666307e-07, "loss": 0.0327, "reward": 1.254166454076767, "reward_std": 0.4054510071873665, "rewards/accuracy_reward": 0.7806122452020645, "rewards/improved_len_reward_dast": 0.47355421632528305, "step": 176 }, { "completion_length": 1690.4234313964844, "epoch": 0.4588464031108231, "grad_norm": 0.2099852909442493, "kl": 0.0092010498046875, "learning_rate": 6.906110904732656e-07, "loss": -0.0115, "reward": 1.3241359293460846, "reward_std": 0.4749620705842972, "rewards/accuracy_reward": 0.8163265138864517, "rewards/improved_len_reward_dast": 0.5078093633055687, "step": 177 }, { "completion_length": 2150.1529541015625, "epoch": 0.46143875567077125, "grad_norm": 0.16262254100217993, "kl": 0.01073455810546875, "learning_rate": 6.867240501262666e-07, "loss": 0.0219, "reward": 1.3224327564239502, "reward_std": 0.31201132386922836, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.5622286796569824, "step": 178 }, { "completion_length": 1616.73974609375, "epoch": 0.46403110823071936, "grad_norm": 0.2054857790671321, "kl": 0.010406494140625, "learning_rate": 6.828257380777723e-07, "loss": -0.0028, "reward": 1.2023987025022507, "reward_std": 0.38464218378067017, "rewards/accuracy_reward": 0.8214285671710968, "rewards/improved_len_reward_dast": 0.38097016140818596, "step": 179 }, { "completion_length": 1939.9744567871094, "epoch": 0.4666234607906675, "grad_norm": 0.18969129476831767, "kl": 0.0137481689453125, "learning_rate": 6.789164757094978e-07, "loss": 0.035, "reward": 1.1967380195856094, "reward_std": 0.3427240923047066, "rewards/accuracy_reward": 0.734693855047226, "rewards/improved_len_reward_dast": 0.4620441570878029, "step": 180 }, { "completion_length": 1848.25, "epoch": 0.4692158133506157, "grad_norm": 0.18668896975291646, "kl": 0.011810302734375, "learning_rate": 6.749965853059164e-07, "loss": 0.0536, "reward": 1.3282198309898376, "reward_std": 0.4290488064289093, "rewards/accuracy_reward": 0.8520407974720001, "rewards/improved_len_reward_dast": 0.47617900371551514, "step": 181 }, { "completion_length": 1659.9489440917969, "epoch": 0.47180816591056385, "grad_norm": 0.2068391235436955, "kl": 0.0099334716796875, "learning_rate": 6.710663900276903e-07, "loss": 0.0149, "reward": 1.1044558137655258, "reward_std": 0.389005184173584, "rewards/accuracy_reward": 0.7244897931814194, "rewards/improved_len_reward_dast": 0.37996600940823555, "step": 182 }, { "completion_length": 1548.0152893066406, "epoch": 0.474400518470512, "grad_norm": 0.19942963085334378, "kl": 0.00998687744140625, "learning_rate": 6.671262138850274e-07, "loss": 0.0277, "reward": 1.4036801755428314, "reward_std": 0.325181283056736, "rewards/accuracy_reward": 0.846938744187355, "rewards/improved_len_reward_dast": 0.5567413941025734, "step": 183 }, { "completion_length": 1479.9234619140625, "epoch": 0.4769928710304601, "grad_norm": 0.17528837750916904, "kl": 0.00907135009765625, "learning_rate": 6.631763817109717e-07, "loss": 0.0212, "reward": 1.4963186979293823, "reward_std": 0.2380654364824295, "rewards/accuracy_reward": 0.8826530426740646, "rewards/improved_len_reward_dast": 0.6136656627058983, "step": 184 }, { "completion_length": 1625.2856750488281, "epoch": 0.4795852235904083, "grad_norm": 0.2340295745334256, "kl": 0.00994873046875, "learning_rate": 6.592172191346218e-07, "loss": 0.0387, "reward": 1.3299905359745026, "reward_std": 0.4121420457959175, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.5085620209574699, "step": 185 }, { "completion_length": 1799.586669921875, "epoch": 0.48217757615035645, "grad_norm": 0.208310701570096, "kl": 0.012359619140625, "learning_rate": 6.552490525542864e-07, "loss": 0.0341, "reward": 1.2161507308483124, "reward_std": 0.3565462492406368, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.4559466913342476, "step": 186 }, { "completion_length": 1612.836685180664, "epoch": 0.4847699287103046, "grad_norm": 0.1767048426760215, "kl": 0.0106048583984375, "learning_rate": 6.512722091105757e-07, "loss": -0.0013, "reward": 1.3248589038848877, "reward_std": 0.45474397391080856, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.5136343911290169, "step": 187 }, { "completion_length": 1306.5509796142578, "epoch": 0.4873622812702528, "grad_norm": 0.212241902185087, "kl": 0.00981903076171875, "learning_rate": 6.472870166594314e-07, "loss": 0.0047, "reward": 1.4141908586025238, "reward_std": 0.4169772267341614, "rewards/accuracy_reward": 0.8418367058038712, "rewards/improved_len_reward_dast": 0.5723541006445885, "step": 188 }, { "completion_length": 1914.642822265625, "epoch": 0.4899546338302009, "grad_norm": 0.2520686184939368, "kl": 0.0127410888671875, "learning_rate": 6.432938037450974e-07, "loss": -0.0237, "reward": 1.1971821933984756, "reward_std": 0.3514118604362011, "rewards/accuracy_reward": 0.7499999850988388, "rewards/improved_len_reward_dast": 0.44718217849731445, "step": 189 }, { "completion_length": 1808.9183349609375, "epoch": 0.49254698639014904, "grad_norm": 0.2130749709969565, "kl": 0.01201629638671875, "learning_rate": 6.392928995730352e-07, "loss": 0.0412, "reward": 1.2710473388433456, "reward_std": 0.3865230418741703, "rewards/accuracy_reward": 0.7908163219690323, "rewards/improved_len_reward_dast": 0.48023101314902306, "step": 190 }, { "completion_length": 1365.4795837402344, "epoch": 0.4951393389500972, "grad_norm": 0.250237755024117, "kl": 0.00952911376953125, "learning_rate": 6.352846339827826e-07, "loss": 0.095, "reward": 1.5109961926937103, "reward_std": 0.30784352123737335, "rewards/accuracy_reward": 0.9132653027772903, "rewards/improved_len_reward_dast": 0.5977308824658394, "step": 191 }, { "completion_length": 1425.2755126953125, "epoch": 0.49773169151004537, "grad_norm": 0.22368363257945995, "kl": 0.0114288330078125, "learning_rate": 6.312693374207627e-07, "loss": 0.0195, "reward": 1.2838004529476166, "reward_std": 0.46850764751434326, "rewards/accuracy_reward": 0.8265306055545807, "rewards/improved_len_reward_dast": 0.4572698399424553, "step": 192 }, { "completion_length": 1588.5101623535156, "epoch": 0.5003240440699935, "grad_norm": 0.20204139731047027, "kl": 0.01300048828125, "learning_rate": 6.272473409130397e-07, "loss": 0.0012, "reward": 1.3159003108739853, "reward_std": 0.4093224108219147, "rewards/accuracy_reward": 0.8316326439380646, "rewards/improved_len_reward_dast": 0.484267670661211, "step": 193 }, { "completion_length": 1411.3571166992188, "epoch": 0.5029163966299417, "grad_norm": 0.19443397701968118, "kl": 0.00821685791015625, "learning_rate": 6.232189760380301e-07, "loss": 0.0224, "reward": 1.288124531507492, "reward_std": 0.3209230378270149, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.5024102553725243, "step": 194 }, { "completion_length": 1751.6785278320312, "epoch": 0.5055087491898899, "grad_norm": 0.18304814418314927, "kl": 0.0109100341796875, "learning_rate": 6.191845748991671e-07, "loss": -0.007, "reward": 1.0736610293388367, "reward_std": 0.32857421785593033, "rewards/accuracy_reward": 0.6581632494926453, "rewards/improved_len_reward_dast": 0.41549770161509514, "step": 195 }, { "completion_length": 1771.5968933105469, "epoch": 0.508101101749838, "grad_norm": 0.20612952277089522, "kl": 0.0137939453125, "learning_rate": 6.151444700975203e-07, "loss": 0.0106, "reward": 1.360820233821869, "reward_std": 0.38221075385808945, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.518983505666256, "step": 196 }, { "completion_length": 2076.3060913085938, "epoch": 0.5106934543097861, "grad_norm": 0.22320859434163112, "kl": 0.0132293701171875, "learning_rate": 6.110989947043767e-07, "loss": 0.0519, "reward": 1.101119041442871, "reward_std": 0.4651700109243393, "rewards/accuracy_reward": 0.7244897931814194, "rewards/improved_len_reward_dast": 0.37662921100854874, "step": 197 }, { "completion_length": 1513.6530151367188, "epoch": 0.5132858068697342, "grad_norm": 0.24160481879222073, "kl": 0.0120849609375, "learning_rate": 6.070484822337816e-07, "loss": 0.0617, "reward": 1.3807711601257324, "reward_std": 0.30266276001930237, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.5185262858867645, "step": 198 }, { "completion_length": 1659.4744262695312, "epoch": 0.5158781594296824, "grad_norm": 0.2860111752617934, "kl": 0.0122528076171875, "learning_rate": 6.029932666150431e-07, "loss": 0.0487, "reward": 1.27889584004879, "reward_std": 0.40974466502666473, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.4778754487633705, "step": 199 }, { "completion_length": 1553.6479187011719, "epoch": 0.5184705119896306, "grad_norm": 0.17284042761570728, "kl": 0.0113372802734375, "learning_rate": 5.989336821652029e-07, "loss": -0.0157, "reward": 1.292808324098587, "reward_std": 0.3536081798374653, "rewards/accuracy_reward": 0.7755101919174194, "rewards/improved_len_reward_dast": 0.517298124730587, "step": 200 }, { "completion_length": 1221.6734313964844, "epoch": 0.5210628645495787, "grad_norm": 0.20576387898105802, "kl": 0.00975799560546875, "learning_rate": 5.948700635614745e-07, "loss": 0.0155, "reward": 1.043928012251854, "reward_std": 0.5074506774544716, "rewards/accuracy_reward": 0.734693855047226, "rewards/improved_len_reward_dast": 0.3092341625597328, "step": 201 }, { "completion_length": 1443.3367156982422, "epoch": 0.5236552171095269, "grad_norm": 0.190656293014884, "kl": 0.01007080078125, "learning_rate": 5.908027458136518e-07, "loss": 0.027, "reward": 1.5769412517547607, "reward_std": 0.27542993798851967, "rewards/accuracy_reward": 0.9081632494926453, "rewards/improved_len_reward_dast": 0.6687779873609543, "step": 202 }, { "completion_length": 1383.1325988769531, "epoch": 0.5262475696694751, "grad_norm": 0.18700146403961007, "kl": 0.00789642333984375, "learning_rate": 5.867320642364916e-07, "loss": -0.0, "reward": 1.4069096446037292, "reward_std": 0.452865906059742, "rewards/accuracy_reward": 0.8571428507566452, "rewards/improved_len_reward_dast": 0.5497667863965034, "step": 203 }, { "completion_length": 1636.7448425292969, "epoch": 0.5288399222294232, "grad_norm": 0.18621798443065538, "kl": 0.01001739501953125, "learning_rate": 5.826583544220678e-07, "loss": 0.0023, "reward": 1.1149714589118958, "reward_std": 0.5129830092191696, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.3751755505800247, "step": 204 }, { "completion_length": 1296.4540252685547, "epoch": 0.5314322747893714, "grad_norm": 0.24973009441281563, "kl": 0.00960540771484375, "learning_rate": 5.78581952212107e-07, "loss": 0.057, "reward": 1.439581423997879, "reward_std": 0.20332731679081917, "rewards/accuracy_reward": 0.8775510191917419, "rewards/improved_len_reward_dast": 0.5620303899049759, "step": 205 }, { "completion_length": 1675.2040405273438, "epoch": 0.5340246273493195, "grad_norm": 0.17994542833868402, "kl": 0.0113983154296875, "learning_rate": 5.745031936702997e-07, "loss": 0.0212, "reward": 1.236918032169342, "reward_std": 0.4141309931874275, "rewards/accuracy_reward": 0.7755101919174194, "rewards/improved_len_reward_dast": 0.46140778064727783, "step": 206 }, { "completion_length": 1685.6376953125, "epoch": 0.5366169799092677, "grad_norm": 0.19387833193950482, "kl": 0.0142364501953125, "learning_rate": 5.704224150545956e-07, "loss": 0.0032, "reward": 1.1570499688386917, "reward_std": 0.4146932289004326, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.4172540530562401, "step": 207 }, { "completion_length": 1249.0101928710938, "epoch": 0.5392093324692158, "grad_norm": 0.1923070203823955, "kl": 0.0085906982421875, "learning_rate": 5.663399527894816e-07, "loss": 0.0138, "reward": 1.4272409826517105, "reward_std": 0.34243838489055634, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.5649960786104202, "step": 208 }, { "completion_length": 1525.1734313964844, "epoch": 0.5418016850291639, "grad_norm": 0.19609225255735566, "kl": 0.01036834716796875, "learning_rate": 5.622561434382467e-07, "loss": 0.0011, "reward": 1.1873522847890854, "reward_std": 0.4918947294354439, "rewards/accuracy_reward": 0.8010203838348389, "rewards/improved_len_reward_dast": 0.386331919580698, "step": 209 }, { "completion_length": 1988.4591064453125, "epoch": 0.5443940375891121, "grad_norm": 0.2322805815292897, "kl": 0.0143280029296875, "learning_rate": 5.581713236752361e-07, "loss": 0.0289, "reward": 1.1922202408313751, "reward_std": 0.2860515546053648, "rewards/accuracy_reward": 0.7244897782802582, "rewards/improved_len_reward_dast": 0.46773041412234306, "step": 210 }, { "completion_length": 1433.290771484375, "epoch": 0.5469863901490603, "grad_norm": 0.2984688713886969, "kl": 0.0114898681640625, "learning_rate": 5.540858302580934e-07, "loss": 0.0818, "reward": 1.3492214977741241, "reward_std": 0.3557019531726837, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.48697663098573685, "step": 211 }, { "completion_length": 1686.086669921875, "epoch": 0.5495787427090084, "grad_norm": 0.17323504261296585, "kl": 0.01081085205078125, "learning_rate": 5.5e-07, "loss": -0.0227, "reward": 0.910240039229393, "reward_std": 0.49440842866897583, "rewards/accuracy_reward": 0.6632653027772903, "rewards/improved_len_reward_dast": 0.24697477743029594, "step": 212 }, { "completion_length": 1503.3571166992188, "epoch": 0.5521710952689566, "grad_norm": 0.19940687047680583, "kl": 0.0108795166015625, "learning_rate": 5.459141697419066e-07, "loss": 0.0196, "reward": 1.414816826581955, "reward_std": 0.24907327815890312, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.5525719411671162, "step": 213 }, { "completion_length": 1326.4744720458984, "epoch": 0.5547634478289047, "grad_norm": 0.1968213437884411, "kl": 0.00897216796875, "learning_rate": 5.418286763247641e-07, "loss": 0.0333, "reward": 1.5710687637329102, "reward_std": 0.27853039279580116, "rewards/accuracy_reward": 0.9336734712123871, "rewards/improved_len_reward_dast": 0.6373953074216843, "step": 214 }, { "completion_length": 1814.7856750488281, "epoch": 0.5573558003888529, "grad_norm": 0.1910754560182501, "kl": 0.0157623291015625, "learning_rate": 5.377438565617532e-07, "loss": 0.0053, "reward": 1.1130409240722656, "reward_std": 0.5712603330612183, "rewards/accuracy_reward": 0.7091836780309677, "rewards/improved_len_reward_dast": 0.4038572832942009, "step": 215 }, { "completion_length": 2041.4693603515625, "epoch": 0.5599481529488011, "grad_norm": 0.19528431114703992, "kl": 0.017974853515625, "learning_rate": 5.336600472105186e-07, "loss": 0.0026, "reward": 1.1326239556074142, "reward_std": 0.5115986987948418, "rewards/accuracy_reward": 0.7193877249956131, "rewards/improved_len_reward_dast": 0.41323617100715637, "step": 216 }, { "completion_length": 1490.438720703125, "epoch": 0.5625405055087492, "grad_norm": 0.1818395863982982, "kl": 0.011444091796875, "learning_rate": 5.295775849454045e-07, "loss": -0.025, "reward": 1.1338547468185425, "reward_std": 0.26832524314522743, "rewards/accuracy_reward": 0.75, "rewards/improved_len_reward_dast": 0.3838547393679619, "step": 217 }, { "completion_length": 1993.8571166992188, "epoch": 0.5651328580686974, "grad_norm": 0.23754078779498058, "kl": 0.0171356201171875, "learning_rate": 5.254968063297003e-07, "loss": -0.0245, "reward": 1.088214099407196, "reward_std": 0.33989886194467545, "rewards/accuracy_reward": 0.6938775330781937, "rewards/improved_len_reward_dast": 0.3943365402519703, "step": 218 }, { "completion_length": 1916.8775024414062, "epoch": 0.5677252106286454, "grad_norm": 0.23169329147427764, "kl": 0.0146942138671875, "learning_rate": 5.214180477878931e-07, "loss": -0.0216, "reward": 1.1535532772541046, "reward_std": 0.5523173958063126, "rewards/accuracy_reward": 0.739795908331871, "rewards/improved_len_reward_dast": 0.4137573465704918, "step": 219 }, { "completion_length": 2072.586700439453, "epoch": 0.5703175631885936, "grad_norm": 0.179237513002948, "kl": 0.0157623291015625, "learning_rate": 5.173416455779323e-07, "loss": 0.0061, "reward": 1.129465639591217, "reward_std": 0.47254087403416634, "rewards/accuracy_reward": 0.7397958934307098, "rewards/improved_len_reward_dast": 0.3896697536110878, "step": 220 }, { "completion_length": 1500.7499694824219, "epoch": 0.5729099157485418, "grad_norm": 0.18878843129064268, "kl": 0.01107025146484375, "learning_rate": 5.132679357635086e-07, "loss": -0.0142, "reward": 1.1763963997364044, "reward_std": 0.48718392848968506, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.40598829090595245, "step": 221 }, { "completion_length": 1644.9030151367188, "epoch": 0.5755022683084899, "grad_norm": 0.17742073908553643, "kl": 0.0126495361328125, "learning_rate": 5.091972541863481e-07, "loss": 0.0186, "reward": 1.1986051201820374, "reward_std": 0.4172977935522795, "rewards/accuracy_reward": 0.734693855047226, "rewards/improved_len_reward_dast": 0.463911272585392, "step": 222 }, { "completion_length": 1161.091812133789, "epoch": 0.5780946208684381, "grad_norm": 0.189357723748229, "kl": 0.00917816162109375, "learning_rate": 5.051299364385257e-07, "loss": 0.0034, "reward": 1.5119259655475616, "reward_std": 0.34742674231529236, "rewards/accuracy_reward": 0.9030611962080002, "rewards/improved_len_reward_dast": 0.6088647544384003, "step": 223 }, { "completion_length": 2160.7142944335938, "epoch": 0.5806869734283863, "grad_norm": 0.1958816052872559, "kl": 0.0196075439453125, "learning_rate": 5.010663178347971e-07, "loss": 0.0345, "reward": 1.2357909381389618, "reward_std": 0.4518684595823288, "rewards/accuracy_reward": 0.7448979318141937, "rewards/improved_len_reward_dast": 0.4908929914236069, "step": 224 }, { "completion_length": 1368.7703552246094, "epoch": 0.5832793259883344, "grad_norm": 0.2126816864157868, "kl": 0.01153564453125, "learning_rate": 4.970067333849568e-07, "loss": 0.0421, "reward": 1.3800954520702362, "reward_std": 0.24764511361718178, "rewards/accuracy_reward": 0.8163265287876129, "rewards/improved_len_reward_dast": 0.5637688413262367, "step": 225 }, { "completion_length": 1523.7958984375, "epoch": 0.5858716785482826, "grad_norm": 0.2103498219912096, "kl": 0.013336181640625, "learning_rate": 4.929515177662182e-07, "loss": 0.0336, "reward": 1.3088043332099915, "reward_std": 0.3938099816441536, "rewards/accuracy_reward": 0.8214285671710968, "rewards/improved_len_reward_dast": 0.48737573623657227, "step": 226 }, { "completion_length": 1753.9897766113281, "epoch": 0.5884640311082308, "grad_norm": 0.17623732882686455, "kl": 0.0133514404296875, "learning_rate": 4.889010052956233e-07, "loss": 0.0184, "reward": 1.1956195682287216, "reward_std": 0.38174545764923096, "rewards/accuracy_reward": 0.7551020234823227, "rewards/improved_len_reward_dast": 0.44051752984523773, "step": 227 }, { "completion_length": 1186.4795837402344, "epoch": 0.5910563836681789, "grad_norm": 0.19103765244425439, "kl": 0.00911712646484375, "learning_rate": 4.848555299024798e-07, "loss": -0.0025, "reward": 1.3858640789985657, "reward_std": 0.2998353075236082, "rewards/accuracy_reward": 0.8724489808082581, "rewards/improved_len_reward_dast": 0.5134151205420494, "step": 228 }, { "completion_length": 1717.0713806152344, "epoch": 0.593648736228127, "grad_norm": 0.1787260124676487, "kl": 0.01560211181640625, "learning_rate": 4.80815425100833e-07, "loss": 0.0131, "reward": 1.2940033674240112, "reward_std": 0.3880784399807453, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.5031870305538177, "step": 229 }, { "completion_length": 1570.3979187011719, "epoch": 0.5962410887880751, "grad_norm": 0.1932563584259016, "kl": 0.0125732421875, "learning_rate": 4.7678102396196983e-07, "loss": 0.0028, "reward": 1.194681242108345, "reward_std": 0.36879952996969223, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.4242731127887964, "step": 230 }, { "completion_length": 1627.1173400878906, "epoch": 0.5988334413480233, "grad_norm": 0.20069193255347081, "kl": 0.01148223876953125, "learning_rate": 4.727526590869605e-07, "loss": -0.0024, "reward": 1.2599404603242874, "reward_std": 0.3717983737587929, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.45381802320480347, "step": 231 }, { "completion_length": 1422.693832397461, "epoch": 0.6014257939079715, "grad_norm": 0.22397903045763606, "kl": 0.011993408203125, "learning_rate": 4.6873066257923735e-07, "loss": -0.0198, "reward": 1.1824947893619537, "reward_std": 0.3314864858984947, "rewards/accuracy_reward": 0.7806122153997421, "rewards/improved_len_reward_dast": 0.4018825590610504, "step": 232 }, { "completion_length": 2077.2550659179688, "epoch": 0.6040181464679196, "grad_norm": 0.2622807945246562, "kl": 0.0151519775390625, "learning_rate": 4.647153660172173e-07, "loss": 0.0607, "reward": 1.1635594964027405, "reward_std": 0.392416313290596, "rewards/accuracy_reward": 0.7499999701976776, "rewards/improved_len_reward_dast": 0.4135594889521599, "step": 233 }, { "completion_length": 1738.4336547851562, "epoch": 0.6066104990278678, "grad_norm": 0.24814578097643056, "kl": 0.01483917236328125, "learning_rate": 4.607071004269647e-07, "loss": 0.031, "reward": 1.369605004787445, "reward_std": 0.3843038082122803, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.5583804696798325, "step": 234 }, { "completion_length": 1602.0713806152344, "epoch": 0.609202851587816, "grad_norm": 0.2094489678458985, "kl": 0.01458740234375, "learning_rate": 4.567061962549025e-07, "loss": -0.0277, "reward": 1.1768890023231506, "reward_std": 0.5075602382421494, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.4115828797221184, "step": 235 }, { "completion_length": 1883.586669921875, "epoch": 0.6117952041477641, "grad_norm": 0.18539849926073623, "kl": 0.01873779296875, "learning_rate": 4.527129833405687e-07, "loss": 0.0234, "reward": 1.2962508648633957, "reward_std": 0.23112722299993038, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.5309447646141052, "step": 236 }, { "completion_length": 1541.188720703125, "epoch": 0.6143875567077123, "grad_norm": 0.2211580384146908, "kl": 0.013671875, "learning_rate": 4.4872779088942425e-07, "loss": 0.027, "reward": 1.3446270525455475, "reward_std": 0.4020156227052212, "rewards/accuracy_reward": 0.8265305906534195, "rewards/improved_len_reward_dast": 0.5180964693427086, "step": 237 }, { "completion_length": 1877.1122131347656, "epoch": 0.6169799092676604, "grad_norm": 0.27937868976565, "kl": 0.0175018310546875, "learning_rate": 4.447509474457135e-07, "loss": -0.0519, "reward": 1.3078001737594604, "reward_std": 0.3943771682679653, "rewards/accuracy_reward": 0.811224490404129, "rewards/improved_len_reward_dast": 0.49657563865184784, "step": 238 }, { "completion_length": 1735.6836547851562, "epoch": 0.6195722618276086, "grad_norm": 0.19004402096856263, "kl": 0.013519287109375, "learning_rate": 4.4078278086537823e-07, "loss": 0.019, "reward": 1.430199384689331, "reward_std": 0.45470841974020004, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.5883626788854599, "step": 239 }, { "completion_length": 1290.8877258300781, "epoch": 0.6221646143875567, "grad_norm": 0.20039034607000805, "kl": 0.00916290283203125, "learning_rate": 4.3682361828902846e-07, "loss": 0.0204, "reward": 1.4429042339324951, "reward_std": 0.40230638161301613, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.5857614576816559, "step": 240 }, { "completion_length": 1543.5713958740234, "epoch": 0.6247569669475048, "grad_norm": 0.1796128893155037, "kl": 0.0121002197265625, "learning_rate": 4.328737861149726e-07, "loss": 0.0061, "reward": 1.060480311512947, "reward_std": 0.4090285710990429, "rewards/accuracy_reward": 0.7040816247463226, "rewards/improved_len_reward_dast": 0.35639870166778564, "step": 241 }, { "completion_length": 1650.6581420898438, "epoch": 0.627349319507453, "grad_norm": 0.17035045538288204, "kl": 0.0127410888671875, "learning_rate": 4.289336099723098e-07, "loss": -0.0068, "reward": 1.2868027091026306, "reward_std": 0.4846101552248001, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.49088432639837265, "step": 242 }, { "completion_length": 1806.8724212646484, "epoch": 0.6299416720674011, "grad_norm": 0.21153725027052578, "kl": 0.01531982421875, "learning_rate": 4.250034146940834e-07, "loss": 0.0342, "reward": 1.3773571997880936, "reward_std": 0.32580330967903137, "rewards/accuracy_reward": 0.8265305906534195, "rewards/improved_len_reward_dast": 0.5508265644311905, "step": 243 }, { "completion_length": 1506.8877410888672, "epoch": 0.6325340246273493, "grad_norm": 0.20274200364313702, "kl": 0.01300048828125, "learning_rate": 4.210835242905023e-07, "loss": 0.0114, "reward": 1.3944001197814941, "reward_std": 0.35993905924260616, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.5270532071590424, "step": 244 }, { "completion_length": 1694.5713806152344, "epoch": 0.6351263771872975, "grad_norm": 0.20631633070295144, "kl": 0.01531982421875, "learning_rate": 4.1717426192222784e-07, "loss": 0.001, "reward": 1.269565299153328, "reward_std": 0.3799453191459179, "rewards/accuracy_reward": 0.7908162921667099, "rewards/improved_len_reward_dast": 0.4787489101290703, "step": 245 }, { "completion_length": 2018.9642028808594, "epoch": 0.6377187297472456, "grad_norm": 0.23377044647625822, "kl": 0.01549530029296875, "learning_rate": 4.1327594987373347e-07, "loss": 0.0057, "reward": 0.9710913375020027, "reward_std": 0.4150635525584221, "rewards/accuracy_reward": 0.6479591578245163, "rewards/improved_len_reward_dast": 0.3231321321800351, "step": 246 }, { "completion_length": 1953.44384765625, "epoch": 0.6403110823071938, "grad_norm": 0.18922091960973522, "kl": 0.0152740478515625, "learning_rate": 4.0938890952673443e-07, "loss": -0.0073, "reward": 1.144493117928505, "reward_std": 0.326381828635931, "rewards/accuracy_reward": 0.6989795714616776, "rewards/improved_len_reward_dast": 0.445513516664505, "step": 247 }, { "completion_length": 1779.9234771728516, "epoch": 0.642903434867142, "grad_norm": 0.19009690153217312, "kl": 0.01587677001953125, "learning_rate": 4.05513461333693e-07, "loss": 0.0056, "reward": 1.2144882082939148, "reward_std": 0.3660648465156555, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.45428410917520523, "step": 248 }, { "completion_length": 1680.5816040039062, "epoch": 0.6454957874270901, "grad_norm": 0.18737871436935236, "kl": 0.01519775390625, "learning_rate": 4.016499247913994e-07, "loss": 0.0155, "reward": 1.228882908821106, "reward_std": 0.42849814891815186, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.4584747403860092, "step": 249 }, { "completion_length": 1700.0765075683594, "epoch": 0.6480881399870383, "grad_norm": 0.19083582747427946, "kl": 0.01373291015625, "learning_rate": 3.977986184146328e-07, "loss": 0.0276, "reward": 1.4491282403469086, "reward_std": 0.29963432252407074, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.6021894812583923, "step": 250 }, { "completion_length": 1699.5050964355469, "epoch": 0.6506804925469863, "grad_norm": 0.18294974628895902, "kl": 0.01318359375, "learning_rate": 3.939598597099022e-07, "loss": -0.0028, "reward": 1.1291119307279587, "reward_std": 0.4640827924013138, "rewards/accuracy_reward": 0.7499999850988388, "rewards/improved_len_reward_dast": 0.3791119046509266, "step": 251 }, { "completion_length": 1555.9489135742188, "epoch": 0.6532728451069345, "grad_norm": 0.2987585035266382, "kl": 0.013702392578125, "learning_rate": 3.9013396514927076e-07, "loss": -0.0182, "reward": 1.2567480206489563, "reward_std": 0.38375869020819664, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.4710337221622467, "step": 252 }, { "completion_length": 2022.5509643554688, "epoch": 0.6558651976668827, "grad_norm": 0.16778625708063813, "kl": 0.0160064697265625, "learning_rate": 3.8632125014426566e-07, "loss": 0.0026, "reward": 1.0748438835144043, "reward_std": 0.3207223527133465, "rewards/accuracy_reward": 0.6836734712123871, "rewards/improved_len_reward_dast": 0.3911704570055008, "step": 253 }, { "completion_length": 2008.7550659179688, "epoch": 0.6584575502268308, "grad_norm": 0.20081517128616475, "kl": 0.017364501953125, "learning_rate": 3.8252202901987474e-07, "loss": -0.0036, "reward": 1.1095408350229263, "reward_std": 0.42732013761997223, "rewards/accuracy_reward": 0.7193877398967743, "rewards/improved_len_reward_dast": 0.39015308022499084, "step": 254 }, { "completion_length": 1753.5305786132812, "epoch": 0.661049902786779, "grad_norm": 0.19286213527020518, "kl": 0.015838623046875, "learning_rate": 3.7873661498863384e-07, "loss": -0.0193, "reward": 1.3401989042758942, "reward_std": 0.44482723623514175, "rewards/accuracy_reward": 0.8367346823215485, "rewards/improved_len_reward_dast": 0.5034642219543457, "step": 255 }, { "completion_length": 1714.8316040039062, "epoch": 0.6636422553467272, "grad_norm": 0.19098352531749854, "kl": 0.015716552734375, "learning_rate": 3.7496532012480463e-07, "loss": -0.0172, "reward": 1.285597413778305, "reward_std": 0.3779995068907738, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.4947810471057892, "step": 256 }, { "completion_length": 1587.0254821777344, "epoch": 0.6662346079066753, "grad_norm": 0.1828164836366847, "kl": 0.01513671875, "learning_rate": 3.7120845533864706e-07, "loss": 0.0165, "reward": 1.2909784018993378, "reward_std": 0.3537175990641117, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.5001621246337891, "step": 257 }, { "completion_length": 1945.8519897460938, "epoch": 0.6688269604666235, "grad_norm": 0.2401064586242113, "kl": 0.018310546875, "learning_rate": 3.6746633035078723e-07, "loss": -0.0254, "reward": 0.9318393021821976, "reward_std": 0.3634992204606533, "rewards/accuracy_reward": 0.6530612260103226, "rewards/improved_len_reward_dast": 0.2787781246006489, "step": 258 }, { "completion_length": 1464.5356903076172, "epoch": 0.6714193130265717, "grad_norm": 0.19897550034047456, "kl": 0.0117645263671875, "learning_rate": 3.63739253666684e-07, "loss": 0.0257, "reward": 1.3326016068458557, "reward_std": 0.25891564041376114, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.48566286638379097, "step": 259 }, { "completion_length": 2040.6173095703125, "epoch": 0.6740116655865198, "grad_norm": 0.2093225075876704, "kl": 0.01587677001953125, "learning_rate": 3.6002753255119533e-07, "loss": 0.0446, "reward": 1.1549495160579681, "reward_std": 0.6060752719640732, "rewards/accuracy_reward": 0.7295918166637421, "rewards/improved_len_reward_dast": 0.42535772174596786, "step": 260 }, { "completion_length": 1504.892837524414, "epoch": 0.6766040181464679, "grad_norm": 0.2413238757963301, "kl": 0.013092041015625, "learning_rate": 3.5633147300324706e-07, "loss": 0.039, "reward": 1.3253722488880157, "reward_std": 0.22303567081689835, "rewards/accuracy_reward": 0.7755101919174194, "rewards/improved_len_reward_dast": 0.5498620271682739, "step": 261 }, { "completion_length": 1835.6020202636719, "epoch": 0.679196370706416, "grad_norm": 0.1742605810963208, "kl": 0.0152587890625, "learning_rate": 3.526513797306051e-07, "loss": 0.023, "reward": 1.3810910284519196, "reward_std": 0.3878571353852749, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.5341522693634033, "step": 262 }, { "completion_length": 1934.44384765625, "epoch": 0.6817887232663642, "grad_norm": 0.18402016017590034, "kl": 0.0189971923828125, "learning_rate": 3.489875561247568e-07, "loss": 0.0326, "reward": 1.1064758449792862, "reward_std": 0.5427646264433861, "rewards/accuracy_reward": 0.75, "rewards/improved_len_reward_dast": 0.3564758636057377, "step": 263 }, { "completion_length": 1527.6479187011719, "epoch": 0.6843810758263124, "grad_norm": 0.2535051321853217, "kl": 0.0133209228515625, "learning_rate": 3.453403042358968e-07, "loss": 0.0594, "reward": 1.3837721645832062, "reward_std": 0.3384307250380516, "rewards/accuracy_reward": 0.8571428507566452, "rewards/improved_len_reward_dast": 0.5266292989253998, "step": 264 }, { "completion_length": 1750.1275329589844, "epoch": 0.6869734283862605, "grad_norm": 0.20005193883523226, "kl": 0.014312744140625, "learning_rate": 3.417099247480277e-07, "loss": 0.0069, "reward": 1.1163494735956192, "reward_std": 0.4810503050684929, "rewards/accuracy_reward": 0.7295918166637421, "rewards/improved_len_reward_dast": 0.3867576252669096, "step": 265 }, { "completion_length": 1910.5254821777344, "epoch": 0.6895657809462087, "grad_norm": 0.3018048627256463, "kl": 0.0156402587890625, "learning_rate": 3.3809671695416916e-07, "loss": 0.0357, "reward": 1.147754654288292, "reward_std": 0.5025169178843498, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.3824485056102276, "step": 266 }, { "completion_length": 1284.0663146972656, "epoch": 0.6921581335061568, "grad_norm": 0.18258330323366856, "kl": 0.0092926025390625, "learning_rate": 3.345009787316859e-07, "loss": 0.0015, "reward": 1.4202894866466522, "reward_std": 0.2870555892586708, "rewards/accuracy_reward": 0.8418367058038712, "rewards/improved_len_reward_dast": 0.5784527361392975, "step": 267 }, { "completion_length": 1557.5612030029297, "epoch": 0.694750486066105, "grad_norm": 0.1849700340313966, "kl": 0.012725830078125, "learning_rate": 3.309230065177289e-07, "loss": -0.0079, "reward": 1.4877441823482513, "reward_std": 0.302555400878191, "rewards/accuracy_reward": 0.8622448742389679, "rewards/improved_len_reward_dast": 0.6254993677139282, "step": 268 }, { "completion_length": 1482.5203552246094, "epoch": 0.6973428386260532, "grad_norm": 0.19171071001803489, "kl": 0.0144500732421875, "learning_rate": 3.273630952847971e-07, "loss": -0.0012, "reward": 1.2047373950481415, "reward_std": 0.48537394404411316, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.4445333182811737, "step": 269 }, { "completion_length": 1744.6070861816406, "epoch": 0.6999351911860013, "grad_norm": 0.17132128213246742, "kl": 0.01513671875, "learning_rate": 3.2382153851641996e-07, "loss": 0.0229, "reward": 1.1097373962402344, "reward_std": 0.2911606300622225, "rewards/accuracy_reward": 0.7295918166637421, "rewards/improved_len_reward_dast": 0.38014551997184753, "step": 270 }, { "completion_length": 1705.5968933105469, "epoch": 0.7025275437459495, "grad_norm": 0.2582533948663525, "kl": 0.01708984375, "learning_rate": 3.202986281829616e-07, "loss": 0.045, "reward": 1.3047520220279694, "reward_std": 0.4435114786028862, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.4986295886337757, "step": 271 }, { "completion_length": 1806.591796875, "epoch": 0.7051198963058976, "grad_norm": 0.17993615347196873, "kl": 0.01581573486328125, "learning_rate": 3.1679465471755106e-07, "loss": 0.016, "reward": 1.2005809843540192, "reward_std": 0.2893667705357075, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.45568302273750305, "step": 272 }, { "completion_length": 1960.2244262695312, "epoch": 0.7077122488658457, "grad_norm": 0.21394731890393012, "kl": 0.018402099609375, "learning_rate": 3.1330990699213824e-07, "loss": 0.0026, "reward": 1.3150149285793304, "reward_std": 0.32834067940711975, "rewards/accuracy_reward": 0.7602040469646454, "rewards/improved_len_reward_dast": 0.5548108592629433, "step": 273 }, { "completion_length": 1648.7601623535156, "epoch": 0.7103046014257939, "grad_norm": 0.22677843577967902, "kl": 0.0144500732421875, "learning_rate": 3.0984467229367885e-07, "loss": -0.0289, "reward": 1.186056673526764, "reward_std": 0.3048909828066826, "rewards/accuracy_reward": 0.7653061002492905, "rewards/improved_len_reward_dast": 0.42075058072805405, "step": 274 }, { "completion_length": 1631.3876953125, "epoch": 0.712896953985742, "grad_norm": 0.18075852179231652, "kl": 0.0135955810546875, "learning_rate": 3.063992363004503e-07, "loss": -0.0047, "reward": 1.3900758624076843, "reward_std": 0.35281531512737274, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.5737493187189102, "step": 275 }, { "completion_length": 1794.5203857421875, "epoch": 0.7154893065456902, "grad_norm": 0.20597152512904204, "kl": 0.0141143798828125, "learning_rate": 3.0297388305850004e-07, "loss": 0.0135, "reward": 1.2308696657419205, "reward_std": 0.3947853706777096, "rewards/accuracy_reward": 0.7959183603525162, "rewards/improved_len_reward_dast": 0.434951264411211, "step": 276 }, { "completion_length": 1608.892822265625, "epoch": 0.7180816591056384, "grad_norm": 0.22201185510570046, "kl": 0.0151519775390625, "learning_rate": 2.9956889495822877e-07, "loss": 0.0463, "reward": 1.3714110851287842, "reward_std": 0.41973991319537163, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.549982562661171, "step": 277 }, { "completion_length": 1833.0203552246094, "epoch": 0.7206740116655865, "grad_norm": 0.18677648497687657, "kl": 0.0153656005859375, "learning_rate": 2.961845527111091e-07, "loss": 0.0087, "reward": 1.1960042417049408, "reward_std": 0.35424697771668434, "rewards/accuracy_reward": 0.7499999850988388, "rewards/improved_len_reward_dast": 0.4460042342543602, "step": 278 }, { "completion_length": 1663.1989440917969, "epoch": 0.7232663642255347, "grad_norm": 0.23408313686800128, "kl": 0.0152435302734375, "learning_rate": 2.9282113532654363e-07, "loss": 0.0496, "reward": 1.2954119145870209, "reward_std": 0.4828920140862465, "rewards/accuracy_reward": 0.8265306055545807, "rewards/improved_len_reward_dast": 0.46888134628534317, "step": 279 }, { "completion_length": 1693.0254974365234, "epoch": 0.7258587167854829, "grad_norm": 0.23913668563173046, "kl": 0.019439697265625, "learning_rate": 2.894789200888634e-07, "loss": 0.0174, "reward": 1.4143796861171722, "reward_std": 0.37724653631448746, "rewards/accuracy_reward": 0.8367346674203873, "rewards/improved_len_reward_dast": 0.5776450335979462, "step": 280 }, { "completion_length": 1277.8468780517578, "epoch": 0.728451069345431, "grad_norm": 0.2694215840510146, "kl": 0.0134429931640625, "learning_rate": 2.8615818253446766e-07, "loss": 0.0046, "reward": 1.4540930390357971, "reward_std": 0.3243625983595848, "rewards/accuracy_reward": 0.8775509893894196, "rewards/improved_len_reward_dast": 0.5765420496463776, "step": 281 }, { "completion_length": 1236.0356903076172, "epoch": 0.7310434219053791, "grad_norm": 0.1871177689494516, "kl": 0.0116729736328125, "learning_rate": 2.828591964291093e-07, "loss": 0.0055, "reward": 1.2881307899951935, "reward_std": 0.42027105391025543, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.466702226549387, "step": 282 }, { "completion_length": 1389.3673095703125, "epoch": 0.7336357744653272, "grad_norm": 0.17949852486745174, "kl": 0.0106201171875, "learning_rate": 2.7958223374532363e-07, "loss": -0.029, "reward": 1.2979092001914978, "reward_std": 0.34224472381174564, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.4407663494348526, "step": 283 }, { "completion_length": 1291.64794921875, "epoch": 0.7362281270252754, "grad_norm": 0.20498717449578613, "kl": 0.01025390625, "learning_rate": 2.7632756464000835e-07, "loss": 0.0333, "reward": 1.6148460805416107, "reward_std": 0.25412340462207794, "rewards/accuracy_reward": 0.9234693795442581, "rewards/improved_len_reward_dast": 0.6913766860961914, "step": 284 }, { "completion_length": 1941.4284973144531, "epoch": 0.7388204795852236, "grad_norm": 0.19896247201933293, "kl": 0.019378662109375, "learning_rate": 2.730954574321503e-07, "loss": 0.0303, "reward": 1.0792112797498703, "reward_std": 0.38586486876010895, "rewards/accuracy_reward": 0.7142857015132904, "rewards/improved_len_reward_dast": 0.3649255894124508, "step": 285 }, { "completion_length": 1503.8826446533203, "epoch": 0.7414128321451717, "grad_norm": 0.22350544706234096, "kl": 0.01275634765625, "learning_rate": 2.698861785807055e-07, "loss": 0.0311, "reward": 1.5651328265666962, "reward_std": 0.3553974963724613, "rewards/accuracy_reward": 0.9030612260103226, "rewards/improved_len_reward_dast": 0.6620715856552124, "step": 286 }, { "completion_length": 1731.8214111328125, "epoch": 0.7440051847051199, "grad_norm": 0.23609281842069962, "kl": 0.0157470703125, "learning_rate": 2.6669999266263154e-07, "loss": -0.0306, "reward": 1.1723814904689789, "reward_std": 0.5022178217768669, "rewards/accuracy_reward": 0.7602040767669678, "rewards/improved_len_reward_dast": 0.41217736527323723, "step": 287 }, { "completion_length": 1870.0458679199219, "epoch": 0.7465975372650681, "grad_norm": 0.15632978700328695, "kl": 0.0158843994140625, "learning_rate": 2.635371623510758e-07, "loss": 0.0204, "reward": 1.0800221413373947, "reward_std": 0.2878151945769787, "rewards/accuracy_reward": 0.6887754872441292, "rewards/improved_len_reward_dast": 0.39124663546681404, "step": 288 }, { "completion_length": 1414.2703552246094, "epoch": 0.7491898898250162, "grad_norm": 0.23286966119816113, "kl": 0.0133056640625, "learning_rate": 2.6039794839372066e-07, "loss": -0.0074, "reward": 1.341863602399826, "reward_std": 0.36198627576231956, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.530639074742794, "step": 289 }, { "completion_length": 1749.2295532226562, "epoch": 0.7517822423849644, "grad_norm": 0.17241966258758817, "kl": 0.0135955810546875, "learning_rate": 2.5728260959128614e-07, "loss": -0.0129, "reward": 1.2213443964719772, "reward_std": 0.4387034922838211, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.46114034205675125, "step": 290 }, { "completion_length": 2126.826446533203, "epoch": 0.7543745949449125, "grad_norm": 0.2030042278234921, "kl": 0.018890380859375, "learning_rate": 2.541914027761951e-07, "loss": 0.0435, "reward": 1.1566181033849716, "reward_std": 0.505137488245964, "rewards/accuracy_reward": 0.7244897782802582, "rewards/improved_len_reward_dast": 0.43212827295064926, "step": 291 }, { "completion_length": 1632.0713653564453, "epoch": 0.7569669475048607, "grad_norm": 0.24718377241844533, "kl": 0.016876220703125, "learning_rate": 2.511245827913991e-07, "loss": 0.0421, "reward": 1.2267541885375977, "reward_std": 0.3394501358270645, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.4563460126519203, "step": 292 }, { "completion_length": 1807.6529846191406, "epoch": 0.7595593000648088, "grad_norm": 0.1861047697263272, "kl": 0.01556396484375, "learning_rate": 2.4808240246936866e-07, "loss": -0.0078, "reward": 1.2387667298316956, "reward_std": 0.4819525480270386, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.44284842535853386, "step": 293 }, { "completion_length": 1847.19384765625, "epoch": 0.7621516526247569, "grad_norm": 0.22670935044930915, "kl": 0.018310546875, "learning_rate": 2.450651126112504e-07, "loss": 0.0266, "reward": 1.4322427809238434, "reward_std": 0.2754583992063999, "rewards/accuracy_reward": 0.8418367058038712, "rewards/improved_len_reward_dast": 0.590406060218811, "step": 294 }, { "completion_length": 1595.9795532226562, "epoch": 0.7647440051847051, "grad_norm": 0.20527730505286215, "kl": 0.015838623046875, "learning_rate": 2.4207296196618924e-07, "loss": 0.0242, "reward": 1.3626587092876434, "reward_std": 0.32539451494812965, "rewards/accuracy_reward": 0.7908162921667099, "rewards/improved_len_reward_dast": 0.5718424171209335, "step": 295 }, { "completion_length": 1054.137710571289, "epoch": 0.7673363577446533, "grad_norm": 0.21493362850187817, "kl": 0.0093536376953125, "learning_rate": 2.3910619721082253e-07, "loss": 0.0196, "reward": 1.4152240753173828, "reward_std": 0.35989922285079956, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.5478771775960922, "step": 296 }, { "completion_length": 1474.3367004394531, "epoch": 0.7699287103046014, "grad_norm": 0.20358206304391516, "kl": 0.0144500732421875, "learning_rate": 2.3616506292894282e-07, "loss": 0.0271, "reward": 1.4626062214374542, "reward_std": 0.29278943687677383, "rewards/accuracy_reward": 0.8775510042905807, "rewards/improved_len_reward_dast": 0.5850552245974541, "step": 297 }, { "completion_length": 1752.2295227050781, "epoch": 0.7725210628645496, "grad_norm": 0.1833066106969091, "kl": 0.015289306640625, "learning_rate": 2.332498015913344e-07, "loss": 0.0009, "reward": 1.3457911014556885, "reward_std": 0.2773626856505871, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.5345666632056236, "step": 298 }, { "completion_length": 1325.688720703125, "epoch": 0.7751134154244977, "grad_norm": 0.19517765602950424, "kl": 0.01210784912109375, "learning_rate": 2.303606535357843e-07, "loss": 0.0599, "reward": 1.5037426948547363, "reward_std": 0.26091703958809376, "rewards/accuracy_reward": 0.8775510191917419, "rewards/improved_len_reward_dast": 0.6261917278170586, "step": 299 }, { "completion_length": 1663.0662689208984, "epoch": 0.7777057679844459, "grad_norm": 0.20601240191104908, "kl": 0.01605224609375, "learning_rate": 2.2749785694726685e-07, "loss": 0.0094, "reward": 1.3560754358768463, "reward_std": 0.37762896716594696, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.5346468687057495, "step": 300 }, { "completion_length": 1426.6173095703125, "epoch": 0.7802981205443941, "grad_norm": 0.20108821286385423, "kl": 0.0143585205078125, "learning_rate": 2.2466164783830972e-07, "loss": 0.0207, "reward": 1.3399082869291306, "reward_std": 0.3976980447769165, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.5337858349084854, "step": 301 }, { "completion_length": 1790.8978881835938, "epoch": 0.7828904731043422, "grad_norm": 0.21383459811515595, "kl": 0.0155029296875, "learning_rate": 2.2185226002953483e-07, "loss": 0.0004, "reward": 1.2710506618022919, "reward_std": 0.3618534617125988, "rewards/accuracy_reward": 0.785714253783226, "rewards/improved_len_reward_dast": 0.4853363707661629, "step": 302 }, { "completion_length": 1939.8775024414062, "epoch": 0.7854828256642904, "grad_norm": 0.29379980912133363, "kl": 0.01885986328125, "learning_rate": 2.1906992513038268e-07, "loss": 0.0479, "reward": 1.2805213034152985, "reward_std": 0.4143086224794388, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.4692968502640724, "step": 303 }, { "completion_length": 1614.3775329589844, "epoch": 0.7880751782242384, "grad_norm": 0.17729210448855, "kl": 0.0162353515625, "learning_rate": 2.1631487252001822e-07, "loss": 0.0049, "reward": 1.234568029642105, "reward_std": 0.417904369533062, "rewards/accuracy_reward": 0.8010203838348389, "rewards/improved_len_reward_dast": 0.43354763835668564, "step": 304 }, { "completion_length": 2287.780548095703, "epoch": 0.7906675307841866, "grad_norm": 1.2242934021255432, "kl": 0.021087646484375, "learning_rate": 2.1358732932842032e-07, "loss": 0.0211, "reward": 1.0315402448177338, "reward_std": 0.36217188835144043, "rewards/accuracy_reward": 0.6581632494926453, "rewards/improved_len_reward_dast": 0.3733769580721855, "step": 305 }, { "completion_length": 1723.3673400878906, "epoch": 0.7932598833441348, "grad_norm": 0.20686736211065535, "kl": 0.015533447265625, "learning_rate": 2.1088752041765734e-07, "loss": 0.0319, "reward": 1.3500191867351532, "reward_std": 0.3599831163883209, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.5438967421650887, "step": 306 }, { "completion_length": 1528.9183654785156, "epoch": 0.7958522359040829, "grad_norm": 0.21573348295043995, "kl": 0.015960693359375, "learning_rate": 2.0821566836334847e-07, "loss": -0.0098, "reward": 1.3639625310897827, "reward_std": 0.3467046692967415, "rewards/accuracy_reward": 0.8469387590885162, "rewards/improved_len_reward_dast": 0.5170237571001053, "step": 307 }, { "completion_length": 1429.280532836914, "epoch": 0.7984445884640311, "grad_norm": 0.18304725042811948, "kl": 0.01262664794921875, "learning_rate": 2.0557199343631494e-07, "loss": 0.0087, "reward": 1.2729185968637466, "reward_std": 0.37279824167490005, "rewards/accuracy_reward": 0.8061224520206451, "rewards/improved_len_reward_dast": 0.4667961820960045, "step": 308 }, { "completion_length": 1876.0458679199219, "epoch": 0.8010369410239793, "grad_norm": 0.20278131778947003, "kl": 0.01853179931640625, "learning_rate": 2.0295671358442033e-07, "loss": 0.019, "reward": 1.3648760467767715, "reward_std": 0.3640540838241577, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.5536516159772873, "step": 309 }, { "completion_length": 1463.239730834961, "epoch": 0.8036292935839274, "grad_norm": 0.22793846718497435, "kl": 0.014312744140625, "learning_rate": 2.0037004441460263e-07, "loss": 0.0287, "reward": 1.3905141055583954, "reward_std": 0.41797252371907234, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.5486774370074272, "step": 310 }, { "completion_length": 1581.4999542236328, "epoch": 0.8062216461438756, "grad_norm": 0.2080094216762287, "kl": 0.01576995849609375, "learning_rate": 1.9781219917509987e-07, "loss": 0.0138, "reward": 1.4025911092758179, "reward_std": 0.3261520601809025, "rewards/accuracy_reward": 0.8265306055545807, "rewards/improved_len_reward_dast": 0.5760605186223984, "step": 311 }, { "completion_length": 1737.1019897460938, "epoch": 0.8088139987038238, "grad_norm": 0.22193491426249878, "kl": 0.0164794921875, "learning_rate": 1.9528338873786882e-07, "loss": 0.0217, "reward": 1.1316132843494415, "reward_std": 0.44266829639673233, "rewards/accuracy_reward": 0.7397959157824516, "rewards/improved_len_reward_dast": 0.39181735552847385, "step": 312 }, { "completion_length": 1681.6224060058594, "epoch": 0.8114063512637719, "grad_norm": 0.21692033379747663, "kl": 0.0162506103515625, "learning_rate": 1.9278382158120116e-07, "loss": 0.0256, "reward": 1.2757752537727356, "reward_std": 0.447167094796896, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.4798569083213806, "step": 313 }, { "completion_length": 1513.8316040039062, "epoch": 0.81399870382372, "grad_norm": 0.18130741669805844, "kl": 0.01153564453125, "learning_rate": 1.9031370377253574e-07, "loss": 0.0246, "reward": 1.535945862531662, "reward_std": 0.31188252195715904, "rewards/accuracy_reward": 0.8826530426740646, "rewards/improved_len_reward_dast": 0.653292790055275, "step": 314 }, { "completion_length": 1734.6632385253906, "epoch": 0.8165910563836681, "grad_norm": 0.18939277983218827, "kl": 0.0179443359375, "learning_rate": 1.8787323895147052e-07, "loss": -0.001, "reward": 1.1586688458919525, "reward_std": 0.4217538684606552, "rewards/accuracy_reward": 0.7551020234823227, "rewards/improved_len_reward_dast": 0.4035668522119522, "step": 315 }, { "completion_length": 1650.4846496582031, "epoch": 0.8191834089436163, "grad_norm": 0.2171448495391751, "kl": 0.0167999267578125, "learning_rate": 1.8546262831297438e-07, "loss": -0.0121, "reward": 1.464043915271759, "reward_std": 0.3952450007200241, "rewards/accuracy_reward": 0.8724489510059357, "rewards/improved_len_reward_dast": 0.5915949791669846, "step": 316 }, { "completion_length": 1495.3316040039062, "epoch": 0.8217757615035645, "grad_norm": 0.19836205451789388, "kl": 0.0137481689453125, "learning_rate": 1.8308207059079938e-07, "loss": -0.0069, "reward": 1.1547789573669434, "reward_std": 0.41507500410079956, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.3843708522617817, "step": 317 }, { "completion_length": 1517.8367004394531, "epoch": 0.8243681140635126, "grad_norm": 0.20600261332668526, "kl": 0.0160064697265625, "learning_rate": 1.8073176204109837e-07, "loss": 0.0437, "reward": 1.438821941614151, "reward_std": 0.306551206856966, "rewards/accuracy_reward": 0.8775510042905807, "rewards/improved_len_reward_dast": 0.5612709149718285, "step": 318 }, { "completion_length": 1504.4285278320312, "epoch": 0.8269604666234608, "grad_norm": 0.21261278084781152, "kl": 0.014495849609375, "learning_rate": 1.7841189642624428e-07, "loss": 0.0231, "reward": 1.229389488697052, "reward_std": 0.4350128807127476, "rewards/accuracy_reward": 0.7959183603525162, "rewards/improved_len_reward_dast": 0.4334711404517293, "step": 319 }, { "completion_length": 1672.8316040039062, "epoch": 0.829552819183409, "grad_norm": 0.1943882700904058, "kl": 0.0173492431640625, "learning_rate": 1.7612266499885642e-07, "loss": 0.0464, "reward": 1.5176236629486084, "reward_std": 0.3366955704987049, "rewards/accuracy_reward": 0.8877550810575485, "rewards/improved_len_reward_dast": 0.6298686116933823, "step": 320 }, { "completion_length": 1179.0713653564453, "epoch": 0.8321451717433571, "grad_norm": 0.22615060777330476, "kl": 0.012054443359375, "learning_rate": 1.7386425648603354e-07, "loss": 0.0423, "reward": 1.5581437051296234, "reward_std": 0.234028534963727, "rewards/accuracy_reward": 0.8979591578245163, "rewards/improved_len_reward_dast": 0.6601845473051071, "step": 321 }, { "completion_length": 1385.7346649169922, "epoch": 0.8347375243033053, "grad_norm": 0.18647668905538498, "kl": 0.0132293701171875, "learning_rate": 1.716368570737946e-07, "loss": -0.0176, "reward": 1.5387031435966492, "reward_std": 0.39274929463863373, "rewards/accuracy_reward": 0.9081632643938065, "rewards/improved_len_reward_dast": 0.6305398866534233, "step": 322 }, { "completion_length": 1955.0357055664062, "epoch": 0.8373298768632534, "grad_norm": 0.1871384863519405, "kl": 0.01862335205078125, "learning_rate": 1.6944065039173004e-07, "loss": 0.0282, "reward": 0.9992491155862808, "reward_std": 0.4749828167259693, "rewards/accuracy_reward": 0.6785714030265808, "rewards/improved_len_reward_dast": 0.3206777200102806, "step": 323 }, { "completion_length": 1949.9693298339844, "epoch": 0.8399222294232016, "grad_norm": 0.20078422959231634, "kl": 0.020111083984375, "learning_rate": 1.672758174978622e-07, "loss": 0.0315, "reward": 1.227005422115326, "reward_std": 0.36194342374801636, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.46680130809545517, "step": 324 }, { "completion_length": 1403.64794921875, "epoch": 0.8425145819831497, "grad_norm": 0.20565437549884577, "kl": 0.0128936767578125, "learning_rate": 1.6514253686371917e-07, "loss": 0.0204, "reward": 1.4708826392889023, "reward_std": 0.2500988617539406, "rewards/accuracy_reward": 0.8826530426740646, "rewards/improved_len_reward_dast": 0.5882296115159988, "step": 325 }, { "completion_length": 1667.8264770507812, "epoch": 0.8451069345430978, "grad_norm": 0.21813136540877595, "kl": 0.0157318115234375, "learning_rate": 1.630409843596216e-07, "loss": 0.0307, "reward": 1.3411798775196075, "reward_std": 0.32134104520082474, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.53505739569664, "step": 326 }, { "completion_length": 1616.2908020019531, "epoch": 0.847699287103046, "grad_norm": 0.1969183257495155, "kl": 0.0156402587890625, "learning_rate": 1.609713332401831e-07, "loss": 0.0085, "reward": 1.2519380450248718, "reward_std": 0.458795890212059, "rewards/accuracy_reward": 0.7806122452020645, "rewards/improved_len_reward_dast": 0.4713258519768715, "step": 327 }, { "completion_length": 1625.6377258300781, "epoch": 0.8502916396629941, "grad_norm": 0.24417535965250406, "kl": 0.0139617919921875, "learning_rate": 1.5893375413002765e-07, "loss": -0.0317, "reward": 1.2513196468353271, "reward_std": 0.47703811526298523, "rewards/accuracy_reward": 0.7704081386327744, "rewards/improved_len_reward_dast": 0.4809115380048752, "step": 328 }, { "completion_length": 2058.948944091797, "epoch": 0.8528839922229423, "grad_norm": 0.19451912015501954, "kl": 0.0210418701171875, "learning_rate": 1.569284150097226e-07, "loss": 0.0377, "reward": 1.2445521801710129, "reward_std": 0.26459160074591637, "rewards/accuracy_reward": 0.7295918315649033, "rewards/improved_len_reward_dast": 0.5149602852761745, "step": 329 }, { "completion_length": 1789.7040405273438, "epoch": 0.8554763447828905, "grad_norm": 0.24266903278771249, "kl": 0.019378662109375, "learning_rate": 1.5495548120193003e-07, "loss": 0.0434, "reward": 1.322462946176529, "reward_std": 0.38080430775880814, "rewards/accuracy_reward": 0.8265305906534195, "rewards/improved_len_reward_dast": 0.49593234062194824, "step": 330 }, { "completion_length": 1468.8213653564453, "epoch": 0.8580686973428386, "grad_norm": 0.1945755306885796, "kl": 0.01294708251953125, "learning_rate": 1.5301511535777784e-07, "loss": 0.0302, "reward": 1.5070666372776031, "reward_std": 0.3562978059053421, "rewards/accuracy_reward": 0.8724489510059357, "rewards/improved_len_reward_dast": 0.6346177160739899, "step": 331 }, { "completion_length": 1581.3825988769531, "epoch": 0.8606610499027868, "grad_norm": 0.29272858693831433, "kl": 0.01812744140625, "learning_rate": 1.5110747744345006e-07, "loss": 0.0122, "reward": 1.3418152332305908, "reward_std": 0.4640466570854187, "rewards/accuracy_reward": 0.8724489659070969, "rewards/improved_len_reward_dast": 0.46936625242233276, "step": 332 }, { "completion_length": 1786.1734313964844, "epoch": 0.863253402462735, "grad_norm": 0.19480551857525122, "kl": 0.019775390625, "learning_rate": 1.4923272472699986e-07, "loss": -0.0042, "reward": 1.1590133309364319, "reward_std": 0.2618263028562069, "rewards/accuracy_reward": 0.7193877398967743, "rewards/improved_len_reward_dast": 0.4396255351603031, "step": 333 }, { "completion_length": 1171.147933959961, "epoch": 0.8658457550226831, "grad_norm": 0.23814232802014945, "kl": 0.013671875, "learning_rate": 1.4739101176538274e-07, "loss": 0.0174, "reward": 1.2705652117729187, "reward_std": 0.3895917683839798, "rewards/accuracy_reward": 0.8367346823215485, "rewards/improved_len_reward_dast": 0.43383053690195084, "step": 334 }, { "completion_length": 1758.0816040039062, "epoch": 0.8684381075826313, "grad_norm": 0.22764969968005389, "kl": 0.0219268798828125, "learning_rate": 1.4558249039171639e-07, "loss": 0.0414, "reward": 1.358829528093338, "reward_std": 0.38345643877983093, "rewards/accuracy_reward": 0.8367346823215485, "rewards/improved_len_reward_dast": 0.5220948457717896, "step": 335 }, { "completion_length": 1889.0509948730469, "epoch": 0.8710304601425793, "grad_norm": 0.22895792507657853, "kl": 0.021484375, "learning_rate": 1.4380730970276195e-07, "loss": 0.0354, "reward": 1.07760888338089, "reward_std": 0.3665538318455219, "rewards/accuracy_reward": 0.6887754797935486, "rewards/improved_len_reward_dast": 0.3888333588838577, "step": 336 }, { "completion_length": 2373.249969482422, "epoch": 0.8736228127025275, "grad_norm": 0.2697468121522664, "kl": 0.026397705078125, "learning_rate": 1.420656160466333e-07, "loss": -0.0102, "reward": 1.0278730392456055, "reward_std": 0.348503515124321, "rewards/accuracy_reward": 0.6938775330781937, "rewards/improved_len_reward_dast": 0.33399548195302486, "step": 337 }, { "completion_length": 1981.8978881835938, "epoch": 0.8762151652624757, "grad_norm": 0.20587316419649823, "kl": 0.0223846435546875, "learning_rate": 1.4035755301073102e-07, "loss": 0.0273, "reward": 1.2939772605895996, "reward_std": 0.46924955397844315, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.5286711901426315, "step": 338 }, { "completion_length": 1536.4336395263672, "epoch": 0.8788075178224238, "grad_norm": 0.20611627730954438, "kl": 0.0202789306640625, "learning_rate": 1.386832614099056e-07, "loss": 0.006, "reward": 1.4531451165676117, "reward_std": 0.3475269414484501, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.5960022807121277, "step": 339 }, { "completion_length": 1489.7652435302734, "epoch": 0.881399870382372, "grad_norm": 0.2223037836334228, "kl": 0.0159454345703125, "learning_rate": 1.3704287927484846e-07, "loss": -0.0138, "reward": 1.3403507471084595, "reward_std": 0.46086446195840836, "rewards/accuracy_reward": 0.8112244606018066, "rewards/improved_len_reward_dast": 0.529126301407814, "step": 340 }, { "completion_length": 1788.7091674804688, "epoch": 0.8839922229423202, "grad_norm": 0.188880858513302, "kl": 0.0198516845703125, "learning_rate": 1.3543654184071186e-07, "loss": 0.0144, "reward": 1.320367306470871, "reward_std": 0.2726456895470619, "rewards/accuracy_reward": 0.7755101919174194, "rewards/improved_len_reward_dast": 0.5448571220040321, "step": 341 }, { "completion_length": 1541.3316192626953, "epoch": 0.8865845755022683, "grad_norm": 0.20649364949795315, "kl": 0.01570892333984375, "learning_rate": 1.3386438153596067e-07, "loss": 0.0104, "reward": 1.327652782201767, "reward_std": 0.3968999646604061, "rewards/accuracy_reward": 0.846938744187355, "rewards/improved_len_reward_dast": 0.4807140678167343, "step": 342 }, { "completion_length": 1504.8775329589844, "epoch": 0.8891769280622165, "grad_norm": 0.23748978746970162, "kl": 0.0181427001953125, "learning_rate": 1.323265279714543e-07, "loss": -0.0172, "reward": 1.3229451477527618, "reward_std": 0.38034195080399513, "rewards/accuracy_reward": 0.8265306055545807, "rewards/improved_len_reward_dast": 0.49641457200050354, "step": 343 }, { "completion_length": 1616.14794921875, "epoch": 0.8917692806221647, "grad_norm": 0.228900632017236, "kl": 0.020263671875, "learning_rate": 1.3082310792976202e-07, "loss": 0.0331, "reward": 1.4383951127529144, "reward_std": 0.32518207281827927, "rewards/accuracy_reward": 0.8520407974720001, "rewards/improved_len_reward_dast": 0.5863542854785919, "step": 344 }, { "completion_length": 1765.0509948730469, "epoch": 0.8943616331821128, "grad_norm": 0.21689615981919957, "kl": 0.0205841064453125, "learning_rate": 1.293542453547102e-07, "loss": 0.0219, "reward": 1.3277872800827026, "reward_std": 0.4930282086133957, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.5114607587456703, "step": 345 }, { "completion_length": 1576.6071166992188, "epoch": 0.8969539857420609, "grad_norm": 0.2503011086919002, "kl": 0.0197906494140625, "learning_rate": 1.279200613411642e-07, "loss": 0.044, "reward": 1.2905025482177734, "reward_std": 0.47432298958301544, "rewards/accuracy_reward": 0.8214285522699356, "rewards/improved_len_reward_dast": 0.46907395869493484, "step": 346 }, { "completion_length": 2153.3162231445312, "epoch": 0.899546338302009, "grad_norm": 0.23273243697852358, "kl": 0.023712158203125, "learning_rate": 1.2652067412504605e-07, "loss": 0.0312, "reward": 1.047543928027153, "reward_std": 0.3953222408890724, "rewards/accuracy_reward": 0.688775509595871, "rewards/improved_len_reward_dast": 0.35876838117837906, "step": 347 }, { "completion_length": 1542.3111877441406, "epoch": 0.9021386908619572, "grad_norm": 0.25879665856811085, "kl": 0.0159149169921875, "learning_rate": 1.251561990735859e-07, "loss": 0.0306, "reward": 1.4665509164333344, "reward_std": 0.34583452716469765, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.599203959107399, "step": 348 }, { "completion_length": 2166.5713806152344, "epoch": 0.9047310434219054, "grad_norm": 0.21742881103681694, "kl": 0.029144287109375, "learning_rate": 1.238267486758117e-07, "loss": 0.0221, "reward": 0.9765184819698334, "reward_std": 0.4072360023856163, "rewards/accuracy_reward": 0.6224489733576775, "rewards/improved_len_reward_dast": 0.3540695160627365, "step": 349 }, { "completion_length": 1897.44384765625, "epoch": 0.9073233959818535, "grad_norm": 0.20381019828760852, "kl": 0.022857666015625, "learning_rate": 1.2253243253327504e-07, "loss": 0.0392, "reward": 1.2360577583312988, "reward_std": 0.4647463858127594, "rewards/accuracy_reward": 0.7653061151504517, "rewards/improved_len_reward_dast": 0.470751591026783, "step": 350 }, { "completion_length": 1563.9234313964844, "epoch": 0.9099157485418017, "grad_norm": 0.2149667100915999, "kl": 0.01705169677734375, "learning_rate": 1.212733573510154e-07, "loss": 0.0251, "reward": 1.484131395816803, "reward_std": 0.3115840032696724, "rewards/accuracy_reward": 0.867346927523613, "rewards/improved_len_reward_dast": 0.6167844533920288, "step": 351 }, { "completion_length": 1613.438720703125, "epoch": 0.9125081011017498, "grad_norm": 0.2397808119710266, "kl": 0.01849365234375, "learning_rate": 1.20049626928764e-07, "loss": 0.0255, "reward": 1.374268501996994, "reward_std": 0.3617161624133587, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.5579419583082199, "step": 352 }, { "completion_length": 1810.8724060058594, "epoch": 0.915100453661698, "grad_norm": 0.1952032672447838, "kl": 0.0240478515625, "learning_rate": 1.1886134215238539e-07, "loss": 0.0013, "reward": 1.2345272898674011, "reward_std": 0.4293368086218834, "rewards/accuracy_reward": 0.7602040618658066, "rewards/improved_len_reward_dast": 0.47432321310043335, "step": 353 }, { "completion_length": 1323.6071319580078, "epoch": 0.9176928062216462, "grad_norm": 0.23544630425662993, "kl": 0.0150299072265625, "learning_rate": 1.1770860098556122e-07, "loss": -0.0126, "reward": 1.5638253688812256, "reward_std": 0.3317151963710785, "rewards/accuracy_reward": 0.9234693795442581, "rewards/improved_len_reward_dast": 0.6403559893369675, "step": 354 }, { "completion_length": 1648.1122436523438, "epoch": 0.9202851587815943, "grad_norm": 0.19373617697957926, "kl": 0.01983642578125, "learning_rate": 1.1659149846171314e-07, "loss": -0.0106, "reward": 1.409626692533493, "reward_std": 0.3634777031838894, "rewards/accuracy_reward": 0.8112244606018066, "rewards/improved_len_reward_dast": 0.5984021797776222, "step": 355 }, { "completion_length": 1640.484634399414, "epoch": 0.9228775113415425, "grad_norm": 0.2139648005259324, "kl": 0.02065277099609375, "learning_rate": 1.1551012667616889e-07, "loss": -0.0041, "reward": 1.3790205717086792, "reward_std": 0.3004123643040657, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.5780001431703568, "step": 356 }, { "completion_length": 1952.6427612304688, "epoch": 0.9254698639014906, "grad_norm": 0.20207361431898127, "kl": 0.027069091796875, "learning_rate": 1.1446457477856933e-07, "loss": 0.0274, "reward": 1.1954913437366486, "reward_std": 0.30133310705423355, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.450593464076519, "step": 357 }, { "completion_length": 1666.0816040039062, "epoch": 0.9280622164614387, "grad_norm": 0.2020263485504787, "kl": 0.0185546875, "learning_rate": 1.1345492896551908e-07, "loss": -0.0157, "reward": 1.4352505505084991, "reward_std": 0.4688113033771515, "rewards/accuracy_reward": 0.8928571343421936, "rewards/improved_len_reward_dast": 0.542393408715725, "step": 358 }, { "completion_length": 1809.0611877441406, "epoch": 0.9306545690213869, "grad_norm": 0.2096938589768357, "kl": 0.020904541015625, "learning_rate": 1.1248127247348025e-07, "loss": 0.0384, "reward": 1.3605789840221405, "reward_std": 0.35709768906235695, "rewards/accuracy_reward": 0.8163264989852905, "rewards/improved_len_reward_dast": 0.544252522289753, "step": 359 }, { "completion_length": 1797.744857788086, "epoch": 0.933246921581335, "grad_norm": 0.21622133027589538, "kl": 0.02146148681640625, "learning_rate": 1.1154368557191032e-07, "loss": 0.0154, "reward": 1.0935336202383041, "reward_std": 0.3505462594330311, "rewards/accuracy_reward": 0.6938775479793549, "rewards/improved_len_reward_dast": 0.3996560573577881, "step": 360 }, { "completion_length": 1433.0765075683594, "epoch": 0.9358392741412832, "grad_norm": 0.22187489868295793, "kl": 0.0160064697265625, "learning_rate": 1.1064224555664489e-07, "loss": -0.0178, "reward": 1.2581793367862701, "reward_std": 0.4055371508002281, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.4520568624138832, "step": 361 }, { "completion_length": 1678.2703857421875, "epoch": 0.9384316267012314, "grad_norm": 0.18769832722230134, "kl": 0.0196075439453125, "learning_rate": 1.0977702674352485e-07, "loss": 0.0061, "reward": 1.533081442117691, "reward_std": 0.24393456988036633, "rewards/accuracy_reward": 0.8673469126224518, "rewards/improved_len_reward_dast": 0.6657344847917557, "step": 362 }, { "completion_length": 1496.3112030029297, "epoch": 0.9410239792611795, "grad_norm": 0.2409591218430649, "kl": 0.01830291748046875, "learning_rate": 1.0894810046227007e-07, "loss": 0.0454, "reward": 1.3800479769706726, "reward_std": 0.3536526523530483, "rewards/accuracy_reward": 0.8316326439380646, "rewards/improved_len_reward_dast": 0.548415370285511, "step": 363 }, { "completion_length": 1296.9234313964844, "epoch": 0.9436163318211277, "grad_norm": 0.2065960957661233, "kl": 0.014404296875, "learning_rate": 1.0815553505059864e-07, "loss": 0.0346, "reward": 1.4174171388149261, "reward_std": 0.3700226917862892, "rewards/accuracy_reward": 0.8673469126224518, "rewards/improved_len_reward_dast": 0.5500702187418938, "step": 364 }, { "completion_length": 1770.8111572265625, "epoch": 0.9462086843810759, "grad_norm": 0.22025176867987864, "kl": 0.0205535888671875, "learning_rate": 1.0739939584859327e-07, "loss": 0.0372, "reward": 1.2784855961799622, "reward_std": 0.40080468729138374, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.4876692369580269, "step": 365 }, { "completion_length": 2252.9540405273438, "epoch": 0.948801036941024, "grad_norm": 0.25202994466231426, "kl": 0.028900146484375, "learning_rate": 1.066797451933144e-07, "loss": 0.0538, "reward": 1.052029862999916, "reward_std": 0.4297824278473854, "rewards/accuracy_reward": 0.6734693944454193, "rewards/improved_len_reward_dast": 0.37856047973036766, "step": 366 }, { "completion_length": 1675.0867309570312, "epoch": 0.9513933895009722, "grad_norm": 0.18981437618840255, "kl": 0.019775390625, "learning_rate": 1.0599664241366108e-07, "loss": 0.0215, "reward": 1.4016070365905762, "reward_std": 0.4491507261991501, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.5444641783833504, "step": 367 }, { "completion_length": 2051.3162536621094, "epoch": 0.9539857420609202, "grad_norm": 0.18988751309956323, "kl": 0.0218658447265625, "learning_rate": 1.0535014382547976e-07, "loss": -0.0024, "reward": 1.3321772515773773, "reward_std": 0.5532524138689041, "rewards/accuracy_reward": 0.8418367207050323, "rewards/improved_len_reward_dast": 0.4903404861688614, "step": 368 }, { "completion_length": 1725.3927917480469, "epoch": 0.9565780946208684, "grad_norm": 0.26332331622328803, "kl": 0.02056884765625, "learning_rate": 1.0474030272692176e-07, "loss": -0.0428, "reward": 1.1207705438137054, "reward_std": 0.582356795668602, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.33505629003047943, "step": 369 }, { "completion_length": 1730.3264465332031, "epoch": 0.9591704471808166, "grad_norm": 0.23147600575876767, "kl": 0.020355224609375, "learning_rate": 1.0416716939404906e-07, "loss": 0.0207, "reward": 1.4236516058444977, "reward_std": 0.4436470791697502, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.5665087997913361, "step": 370 }, { "completion_length": 2078.234649658203, "epoch": 0.9617627997407647, "grad_norm": 0.18318392619509644, "kl": 0.02490234375, "learning_rate": 1.0363079107668965e-07, "loss": 0.0174, "reward": 1.2476365268230438, "reward_std": 0.4425313174724579, "rewards/accuracy_reward": 0.7704081535339355, "rewards/improved_len_reward_dast": 0.4772283583879471, "step": 371 }, { "completion_length": 1901.7754821777344, "epoch": 0.9643551523007129, "grad_norm": 0.2045058157665467, "kl": 0.0230865478515625, "learning_rate": 1.03131211994542e-07, "loss": 0.0151, "reward": 1.1136702597141266, "reward_std": 0.4208161160349846, "rewards/accuracy_reward": 0.6989795863628387, "rewards/improved_len_reward_dast": 0.41469068080186844, "step": 372 }, { "completion_length": 1673.6377563476562, "epoch": 0.9669475048606611, "grad_norm": 0.1953573582384899, "kl": 0.0203399658203125, "learning_rate": 1.0266847333352986e-07, "loss": 0.0144, "reward": 1.2215417325496674, "reward_std": 0.3687748461961746, "rewards/accuracy_reward": 0.8061224222183228, "rewards/improved_len_reward_dast": 0.4154192693531513, "step": 373 }, { "completion_length": 1465.4744262695312, "epoch": 0.9695398574206092, "grad_norm": 0.2392315039852379, "kl": 0.020263671875, "learning_rate": 1.022426132424064e-07, "loss": 0.0264, "reward": 1.3526732623577118, "reward_std": 0.2864141073077917, "rewards/accuracy_reward": 0.8418367058038712, "rewards/improved_len_reward_dast": 0.5108365193009377, "step": 374 }, { "completion_length": 1698.5611877441406, "epoch": 0.9721322099805574, "grad_norm": 0.22243506530923526, "kl": 0.018157958984375, "learning_rate": 1.0185366682960968e-07, "loss": 0.0368, "reward": 1.2421083450317383, "reward_std": 0.3934044614434242, "rewards/accuracy_reward": 0.7908163070678711, "rewards/improved_len_reward_dast": 0.451292023062706, "step": 375 }, { "completion_length": 1694.5101623535156, "epoch": 0.9747245625405055, "grad_norm": 0.2049483563870167, "kl": 0.02301025390625, "learning_rate": 1.015016661603677e-07, "loss": 0.0109, "reward": 1.2675099819898605, "reward_std": 0.27898336201906204, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.4613875336945057, "step": 376 }, { "completion_length": 1818.9183349609375, "epoch": 0.9773169151004537, "grad_norm": 0.2917301156280802, "kl": 0.022247314453125, "learning_rate": 1.011866402540555e-07, "loss": 0.052, "reward": 1.2979410141706467, "reward_std": 0.4051199574023485, "rewards/accuracy_reward": 0.8010203987360001, "rewards/improved_len_reward_dast": 0.4969206303358078, "step": 377 }, { "completion_length": 1676.4030151367188, "epoch": 0.9799092676604018, "grad_norm": 0.19999847167358073, "kl": 0.0189666748046875, "learning_rate": 1.0090861508180229e-07, "loss": 0.0173, "reward": 1.307900682091713, "reward_std": 0.36051470041275024, "rewards/accuracy_reward": 0.806122437119484, "rewards/improved_len_reward_dast": 0.5017782524228096, "step": 378 }, { "completion_length": 1303.3468933105469, "epoch": 0.9825016202203499, "grad_norm": 0.23002851272315084, "kl": 0.016387939453125, "learning_rate": 1.006676135643506e-07, "loss": 0.0223, "reward": 1.5040651261806488, "reward_std": 0.28981203213334084, "rewards/accuracy_reward": 0.8877550810575485, "rewards/improved_len_reward_dast": 0.6163100153207779, "step": 379 }, { "completion_length": 1699.98974609375, "epoch": 0.9850939727802981, "grad_norm": 0.2773167363062717, "kl": 0.021759033203125, "learning_rate": 1.004636555701666e-07, "loss": -0.0024, "reward": 1.3300544768571854, "reward_std": 0.4332263544201851, "rewards/accuracy_reward": 0.857142835855484, "rewards/improved_len_reward_dast": 0.47291168570518494, "step": 380 }, { "completion_length": 2158.5560607910156, "epoch": 0.9876863253402463, "grad_norm": 0.19893298725270195, "kl": 0.027099609375, "learning_rate": 1.0029675791380211e-07, "loss": 0.0245, "reward": 1.366698831319809, "reward_std": 0.3425176590681076, "rewards/accuracy_reward": 0.8112244755029678, "rewards/improved_len_reward_dast": 0.5554743856191635, "step": 381 }, { "completion_length": 1771.0765075683594, "epoch": 0.9902786779001944, "grad_norm": 0.21454331685840108, "kl": 0.025909423828125, "learning_rate": 1.0016693435450846e-07, "loss": 0.0522, "reward": 1.1434401869773865, "reward_std": 0.518133670091629, "rewards/accuracy_reward": 0.7448979467153549, "rewards/improved_len_reward_dast": 0.39854224771261215, "step": 382 }, { "completion_length": 1916.8673095703125, "epoch": 0.9928710304601426, "grad_norm": 0.21868762838968606, "kl": 0.0216217041015625, "learning_rate": 1.00074195595102e-07, "loss": 0.0149, "reward": 1.2855271100997925, "reward_std": 0.4449741840362549, "rewards/accuracy_reward": 0.7857142686843872, "rewards/improved_len_reward_dast": 0.4998128265142441, "step": 383 }, { "completion_length": 1359.0254821777344, "epoch": 0.9954633830200907, "grad_norm": 0.22146763439588837, "kl": 0.01685333251953125, "learning_rate": 1.0001854928108199e-07, "loss": -0.0267, "reward": 1.3678375780582428, "reward_std": 0.3422878012061119, "rewards/accuracy_reward": 0.8214285671710968, "rewards/improved_len_reward_dast": 0.5464089959859848, "step": 384 }, { "completion_length": 1564.7193908691406, "epoch": 0.9980557355800389, "grad_norm": 0.29725903676415294, "kl": 0.019683837890625, "learning_rate": 1e-07, "loss": 0.0597, "reward": 1.2890927195549011, "reward_std": 0.3781392499804497, "rewards/accuracy_reward": 0.795918345451355, "rewards/improved_len_reward_dast": 0.49317440390586853, "step": 385 }, { "epoch": 0.9980557355800389, "step": 385, "total_flos": 0.0, "train_loss": 0.0015093988140246698, "train_runtime": 5817.5821, "train_samples_per_second": 1.856, "train_steps_per_second": 0.066 } ], "logging_steps": 1, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 7, "trial_name": null, "trial_params": null }