Kadins's picture
Model save
b484df4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9980557355800389,
"eval_steps": 500,
"global_step": 385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1848.5458984375,
"epoch": 0.002592352559948153,
"grad_norm": 0.15412024450495956,
"kl": 0.0,
"learning_rate": 2.564102564102564e-08,
"loss": 0.0246,
"reward": 1.4397025108337402,
"reward_std": 0.4701927825808525,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.5978657752275467,
"step": 1
},
{
"completion_length": 2130.4540100097656,
"epoch": 0.005184705119896306,
"grad_norm": 0.19408049978062328,
"kl": 0.0,
"learning_rate": 5.128205128205128e-08,
"loss": 0.0596,
"reward": 1.0504228472709656,
"reward_std": 0.31693385541439056,
"rewards/accuracy_reward": 0.6938775479793549,
"rewards/improved_len_reward_dast": 0.3565452881157398,
"step": 2
},
{
"completion_length": 2034.2958679199219,
"epoch": 0.007777057679844459,
"grad_norm": 0.1531077683543166,
"kl": 0.0001348257064819336,
"learning_rate": 7.692307692307692e-08,
"loss": -0.0129,
"reward": 1.0101122856140137,
"reward_std": 0.4455054961144924,
"rewards/accuracy_reward": 0.6581632494926453,
"rewards/improved_len_reward_dast": 0.3519490174949169,
"step": 3
},
{
"completion_length": 2119.744903564453,
"epoch": 0.010369410239792612,
"grad_norm": 0.1349622041652031,
"kl": 0.00012981891632080078,
"learning_rate": 1.0256410256410256e-07,
"loss": -0.0044,
"reward": 1.2723601460456848,
"reward_std": 0.4871401861310005,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.46623772382736206,
"step": 4
},
{
"completion_length": 1834.7652893066406,
"epoch": 0.012961762799740765,
"grad_norm": 0.16434839601505108,
"kl": 0.00012123584747314453,
"learning_rate": 1.2820512820512818e-07,
"loss": 0.0443,
"reward": 1.267708569765091,
"reward_std": 0.3166223168373108,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.5024024695158005,
"step": 5
},
{
"completion_length": 2152.540740966797,
"epoch": 0.015554115359688918,
"grad_norm": 0.15696438577129812,
"kl": 0.00012969970703125,
"learning_rate": 1.5384615384615385e-07,
"loss": -0.0129,
"reward": 1.0658827871084213,
"reward_std": 0.4334075152873993,
"rewards/accuracy_reward": 0.7142857164144516,
"rewards/improved_len_reward_dast": 0.35159702971577644,
"step": 6
},
{
"completion_length": 1747.4591674804688,
"epoch": 0.01814646791963707,
"grad_norm": 0.15893508336342455,
"kl": 0.00010186433792114258,
"learning_rate": 1.7948717948717948e-07,
"loss": 0.0429,
"reward": 1.1448375135660172,
"reward_std": 0.37509680539369583,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.3846333734691143,
"step": 7
},
{
"completion_length": 1834.0611572265625,
"epoch": 0.020738820479585224,
"grad_norm": 0.1573166657366275,
"kl": 0.00011396408081054688,
"learning_rate": 2.0512820512820512e-07,
"loss": 0.0036,
"reward": 1.272167608141899,
"reward_std": 0.3015933446586132,
"rewards/accuracy_reward": 0.8010203838348389,
"rewards/improved_len_reward_dast": 0.47114718705415726,
"step": 8
},
{
"completion_length": 2077.1122131347656,
"epoch": 0.023331173039533377,
"grad_norm": 0.15123878128380125,
"kl": 0.0001251697540283203,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.0025,
"reward": 1.1346809566020966,
"reward_std": 0.44101474434137344,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.38978295773267746,
"step": 9
},
{
"completion_length": 2001.6989135742188,
"epoch": 0.02592352559948153,
"grad_norm": 0.15946517595083978,
"kl": 0.00013494491577148438,
"learning_rate": 2.5641025641025636e-07,
"loss": 0.0414,
"reward": 1.0840217173099518,
"reward_std": 0.37720372527837753,
"rewards/accuracy_reward": 0.7244897931814194,
"rewards/improved_len_reward_dast": 0.3595319651067257,
"step": 10
},
{
"completion_length": 2258.3468322753906,
"epoch": 0.028515878159429683,
"grad_norm": 0.16258661653616813,
"kl": 0.0001423358917236328,
"learning_rate": 2.8205128205128203e-07,
"loss": -0.0035,
"reward": 1.035923331975937,
"reward_std": 0.44437722116708755,
"rewards/accuracy_reward": 0.6989795863628387,
"rewards/improved_len_reward_dast": 0.33694368600845337,
"step": 11
},
{
"completion_length": 2071.6019897460938,
"epoch": 0.031108230719377836,
"grad_norm": 0.15520698307030686,
"kl": 0.0001367330551147461,
"learning_rate": 3.076923076923077e-07,
"loss": 0.0151,
"reward": 1.1415546834468842,
"reward_std": 0.37767674773931503,
"rewards/accuracy_reward": 0.7653061002492905,
"rewards/improved_len_reward_dast": 0.3762484937906265,
"step": 12
},
{
"completion_length": 1976.1530151367188,
"epoch": 0.033700583279325985,
"grad_norm": 0.17189810461087038,
"kl": 0.00012564659118652344,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0019,
"reward": 1.125291794538498,
"reward_std": 0.4003720059990883,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.36508774384856224,
"step": 13
},
{
"completion_length": 2114.5612182617188,
"epoch": 0.03629293583927414,
"grad_norm": 0.18307106761606742,
"kl": 0.00011533498764038086,
"learning_rate": 3.5897435897435896e-07,
"loss": 0.0248,
"reward": 1.0526445508003235,
"reward_std": 0.33728349953889847,
"rewards/accuracy_reward": 0.6530612111091614,
"rewards/improved_len_reward_dast": 0.3995833285152912,
"step": 14
},
{
"completion_length": 1440.3571166992188,
"epoch": 0.03888528839922229,
"grad_norm": 0.19219239961861387,
"kl": 7.677078247070312e-05,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.0411,
"reward": 1.3660516738891602,
"reward_std": 0.2804589569568634,
"rewards/accuracy_reward": 0.9030611962080002,
"rewards/improved_len_reward_dast": 0.46299050748348236,
"step": 15
},
{
"completion_length": 1305.2295684814453,
"epoch": 0.04147764095917045,
"grad_norm": 0.18960595204343547,
"kl": 9.632110595703125e-05,
"learning_rate": 4.1025641025641024e-07,
"loss": 0.0021,
"reward": 1.418413519859314,
"reward_std": 0.44618362933397293,
"rewards/accuracy_reward": 0.9132652878761292,
"rewards/improved_len_reward_dast": 0.5051482394337654,
"step": 16
},
{
"completion_length": 1996.841796875,
"epoch": 0.0440699935191186,
"grad_norm": 0.16908596036858053,
"kl": 0.00011074542999267578,
"learning_rate": 4.358974358974359e-07,
"loss": 0.0341,
"reward": 1.1314191222190857,
"reward_std": 0.6118374243378639,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.37121502310037613,
"step": 17
},
{
"completion_length": 1431.4846801757812,
"epoch": 0.046662346079066754,
"grad_norm": 0.22735446925703126,
"kl": 8.571147918701172e-05,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.0407,
"reward": 1.206202208995819,
"reward_std": 0.3719758912920952,
"rewards/accuracy_reward": 0.8469387739896774,
"rewards/improved_len_reward_dast": 0.3592635001987219,
"step": 18
},
{
"completion_length": 1709.688720703125,
"epoch": 0.0492546986390149,
"grad_norm": 0.18577878700500422,
"kl": 0.00010377168655395508,
"learning_rate": 4.871794871794871e-07,
"loss": 0.0417,
"reward": 1.1775241941213608,
"reward_std": 0.5288017690181732,
"rewards/accuracy_reward": 0.7806122303009033,
"rewards/improved_len_reward_dast": 0.39691203087568283,
"step": 19
},
{
"completion_length": 1838.2754821777344,
"epoch": 0.05184705119896306,
"grad_norm": 0.16046849749418657,
"kl": 0.00011777877807617188,
"learning_rate": 5.128205128205127e-07,
"loss": 0.0208,
"reward": 1.1064813733100891,
"reward_std": 0.5807419717311859,
"rewards/accuracy_reward": 0.7551020234823227,
"rewards/improved_len_reward_dast": 0.3513793312013149,
"step": 20
},
{
"completion_length": 2217.14794921875,
"epoch": 0.05443940375891121,
"grad_norm": 0.1963426577198746,
"kl": 0.00014448165893554688,
"learning_rate": 5.384615384615384e-07,
"loss": 0.0467,
"reward": 1.0558834075927734,
"reward_std": 0.558340422809124,
"rewards/accuracy_reward": 0.6887754797935486,
"rewards/improved_len_reward_dast": 0.36710788309574127,
"step": 21
},
{
"completion_length": 1927.3316040039062,
"epoch": 0.057031756318859365,
"grad_norm": 0.18525325793381328,
"kl": 9.930133819580078e-05,
"learning_rate": 5.641025641025641e-07,
"loss": 0.0242,
"reward": 1.1790167838335037,
"reward_std": 0.4690204933285713,
"rewards/accuracy_reward": 0.7857142835855484,
"rewards/improved_len_reward_dast": 0.39330248534679413,
"step": 22
},
{
"completion_length": 1841.6938171386719,
"epoch": 0.059624108878807515,
"grad_norm": 0.17253945143916685,
"kl": 0.00010156631469726562,
"learning_rate": 5.897435897435898e-07,
"loss": 0.0724,
"reward": 1.3324860334396362,
"reward_std": 0.28684910759329796,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.5314656794071198,
"step": 23
},
{
"completion_length": 1679.9642333984375,
"epoch": 0.06221646143875567,
"grad_norm": 0.20870673606371046,
"kl": 0.00012958049774169922,
"learning_rate": 6.153846153846154e-07,
"loss": 0.0467,
"reward": 1.1419631987810135,
"reward_std": 0.38880112022161484,
"rewards/accuracy_reward": 0.8010203838348389,
"rewards/improved_len_reward_dast": 0.3409428298473358,
"step": 24
},
{
"completion_length": 2278.8673095703125,
"epoch": 0.06480881399870382,
"grad_norm": 0.15316366458717245,
"kl": 0.00015485286712646484,
"learning_rate": 6.410256410256411e-07,
"loss": 0.0203,
"reward": 0.9916537553071976,
"reward_std": 0.43884778022766113,
"rewards/accuracy_reward": 0.6479591578245163,
"rewards/improved_len_reward_dast": 0.3436945825815201,
"step": 25
},
{
"completion_length": 1853.4744873046875,
"epoch": 0.06740116655865197,
"grad_norm": 0.1623211083206233,
"kl": 0.0001201629638671875,
"learning_rate": 6.666666666666666e-07,
"loss": 0.054,
"reward": 1.1868394315242767,
"reward_std": 0.4521937184035778,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.42663537338376045,
"step": 26
},
{
"completion_length": 1726.6427917480469,
"epoch": 0.06999351911860013,
"grad_norm": 0.21873628771810408,
"kl": 0.0001125335693359375,
"learning_rate": 6.923076923076922e-07,
"loss": 0.086,
"reward": 1.2924230992794037,
"reward_std": 0.41079702973365784,
"rewards/accuracy_reward": 0.8418367356061935,
"rewards/improved_len_reward_dast": 0.45058638602495193,
"step": 27
},
{
"completion_length": 1667.6071166992188,
"epoch": 0.07258587167854828,
"grad_norm": 0.18905776966101132,
"kl": 0.00011527538299560547,
"learning_rate": 7.179487179487179e-07,
"loss": 0.045,
"reward": 1.2638164162635803,
"reward_std": 0.2763877250254154,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.4525919631123543,
"step": 28
},
{
"completion_length": 2032.4132080078125,
"epoch": 0.07517822423849643,
"grad_norm": 0.15326481666027458,
"kl": 0.00012993812561035156,
"learning_rate": 7.435897435897435e-07,
"loss": 0.0002,
"reward": 1.1888954937458038,
"reward_std": 0.41189244389533997,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.42869146168231964,
"step": 29
},
{
"completion_length": 1764.4999389648438,
"epoch": 0.07777057679844458,
"grad_norm": 0.13723640714210214,
"kl": 9.167194366455078e-05,
"learning_rate": 7.692307692307693e-07,
"loss": -0.0066,
"reward": 1.0674456059932709,
"reward_std": 0.4443123862147331,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.2970374431461096,
"step": 30
},
{
"completion_length": 2198.729522705078,
"epoch": 0.08036292935839275,
"grad_norm": 0.15079546325320037,
"kl": 0.0001614093780517578,
"learning_rate": 7.948717948717948e-07,
"loss": 0.013,
"reward": 1.3089748322963715,
"reward_std": 0.5274734199047089,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.48754626512527466,
"step": 31
},
{
"completion_length": 1879.6376647949219,
"epoch": 0.0829552819183409,
"grad_norm": 0.18155740478939822,
"kl": 0.0001251697540283203,
"learning_rate": 8.205128205128205e-07,
"loss": 0.0131,
"reward": 1.0791111141443253,
"reward_std": 0.46941038966178894,
"rewards/accuracy_reward": 0.7346938699483871,
"rewards/improved_len_reward_dast": 0.34441729076206684,
"step": 32
},
{
"completion_length": 1981.6274719238281,
"epoch": 0.08554763447828904,
"grad_norm": 0.1572483646834791,
"kl": 0.0001424551010131836,
"learning_rate": 8.461538461538461e-07,
"loss": 0.0476,
"reward": 1.3903695046901703,
"reward_std": 0.4975530132651329,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.5332267209887505,
"step": 33
},
{
"completion_length": 2061.9999389648438,
"epoch": 0.0881399870382372,
"grad_norm": 0.1901994694040778,
"kl": 0.0001537799835205078,
"learning_rate": 8.717948717948718e-07,
"loss": 0.0481,
"reward": 1.1052793562412262,
"reward_std": 0.4630768448114395,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.36038143932819366,
"step": 34
},
{
"completion_length": 2465.1224060058594,
"epoch": 0.09073233959818536,
"grad_norm": 0.15096654762075654,
"kl": 0.0001761913299560547,
"learning_rate": 8.974358974358974e-07,
"loss": 0.0009,
"reward": 0.7364223003387451,
"reward_std": 0.4229283332824707,
"rewards/accuracy_reward": 0.5357142835855484,
"rewards/improved_len_reward_dast": 0.20070804562419653,
"step": 35
},
{
"completion_length": 2199.688720703125,
"epoch": 0.09332469215813351,
"grad_norm": 0.1791438585472734,
"kl": 0.0001895427703857422,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0399,
"reward": 1.2042141258716583,
"reward_std": 0.3516070544719696,
"rewards/accuracy_reward": 0.7755101770162582,
"rewards/improved_len_reward_dast": 0.4287039190530777,
"step": 36
},
{
"completion_length": 2019.6478881835938,
"epoch": 0.09591704471808166,
"grad_norm": 0.1921872688604767,
"kl": 0.00020241737365722656,
"learning_rate": 9.487179487179486e-07,
"loss": 0.0187,
"reward": 1.3608680367469788,
"reward_std": 0.4326165243983269,
"rewards/accuracy_reward": 0.8316326439380646,
"rewards/improved_len_reward_dast": 0.5292353481054306,
"step": 37
},
{
"completion_length": 1693.0,
"epoch": 0.0985093972780298,
"grad_norm": 0.19045468187511366,
"kl": 0.0001348257064819336,
"learning_rate": 9.743589743589742e-07,
"loss": 0.0464,
"reward": 1.3455627113580704,
"reward_std": 0.3586850240826607,
"rewards/accuracy_reward": 0.846938744187355,
"rewards/improved_len_reward_dast": 0.49862393736839294,
"step": 38
},
{
"completion_length": 2374.637725830078,
"epoch": 0.10110174983797797,
"grad_norm": 0.13494398794899917,
"kl": 0.0002028942108154297,
"learning_rate": 1e-06,
"loss": 0.0272,
"reward": 0.8414318859577179,
"reward_std": 0.48852086812257767,
"rewards/accuracy_reward": 0.6224489659070969,
"rewards/improved_len_reward_dast": 0.21898294147104025,
"step": 39
},
{
"completion_length": 2517.3162841796875,
"epoch": 0.10369410239792612,
"grad_norm": 0.16744933736297124,
"kl": 0.0002105236053466797,
"learning_rate": 9.99981450718918e-07,
"loss": 0.0616,
"reward": 0.9213714599609375,
"reward_std": 0.43374133110046387,
"rewards/accuracy_reward": 0.6275510042905807,
"rewards/improved_len_reward_dast": 0.2938204384408891,
"step": 40
},
{
"completion_length": 1807.0203857421875,
"epoch": 0.10628645495787427,
"grad_norm": 0.15669439739322064,
"kl": 0.0002703666687011719,
"learning_rate": 9.99925804404898e-07,
"loss": 0.0228,
"reward": 0.994490772485733,
"reward_std": 0.5202224850654602,
"rewards/accuracy_reward": 0.7193877547979355,
"rewards/improved_len_reward_dast": 0.27510301768779755,
"step": 41
},
{
"completion_length": 1907.0305786132812,
"epoch": 0.10887880751782242,
"grad_norm": 0.1507066292700219,
"kl": 0.00019288063049316406,
"learning_rate": 9.998330656454915e-07,
"loss": 0.0566,
"reward": 1.3084075152873993,
"reward_std": 0.3637009263038635,
"rewards/accuracy_reward": 0.8367346823215485,
"rewards/improved_len_reward_dast": 0.4716728553175926,
"step": 42
},
{
"completion_length": 1946.2958984375,
"epoch": 0.11147116007777058,
"grad_norm": 0.21826053334493506,
"kl": 0.0002913475036621094,
"learning_rate": 9.99703242086198e-07,
"loss": 0.0894,
"reward": 1.0715700536966324,
"reward_std": 0.4503963589668274,
"rewards/accuracy_reward": 0.7397958934307098,
"rewards/improved_len_reward_dast": 0.3317741868086159,
"step": 43
},
{
"completion_length": 1862.9591674804688,
"epoch": 0.11406351263771873,
"grad_norm": 0.18297677442826724,
"kl": 0.000263214111328125,
"learning_rate": 9.995363444298333e-07,
"loss": 0.037,
"reward": 1.2490134239196777,
"reward_std": 0.4328879788517952,
"rewards/accuracy_reward": 0.7653061076998711,
"rewards/improved_len_reward_dast": 0.4837072864174843,
"step": 44
},
{
"completion_length": 2316.530517578125,
"epoch": 0.11665586519766688,
"grad_norm": 0.15141936649503004,
"kl": 0.0003380775451660156,
"learning_rate": 9.993323864356492e-07,
"loss": 0.0182,
"reward": 0.7743872255086899,
"reward_std": 0.55930295586586,
"rewards/accuracy_reward": 0.5765305981040001,
"rewards/improved_len_reward_dast": 0.19785663951188326,
"step": 45
},
{
"completion_length": 2924.1683349609375,
"epoch": 0.11924821775761503,
"grad_norm": 0.12614913947783052,
"kl": 0.0002567768096923828,
"learning_rate": 9.990913849181977e-07,
"loss": 0.0096,
"reward": 0.8433035537600517,
"reward_std": 0.41744476184248924,
"rewards/accuracy_reward": 0.5561224333941936,
"rewards/improved_len_reward_dast": 0.28718107007443905,
"step": 46
},
{
"completion_length": 1805.5203552246094,
"epoch": 0.1218405703175632,
"grad_norm": 0.15881163011201838,
"kl": 0.0007009506225585938,
"learning_rate": 9.988133597459444e-07,
"loss": 0.0175,
"reward": 1.1679251790046692,
"reward_std": 0.4487800747156143,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.3720068037509918,
"step": 47
},
{
"completion_length": 1873.7499389648438,
"epoch": 0.12443292287751134,
"grad_norm": 0.1713187626068608,
"kl": 0.00028634071350097656,
"learning_rate": 9.984983338396323e-07,
"loss": 0.0488,
"reward": 1.2101139575242996,
"reward_std": 0.33226554840803146,
"rewards/accuracy_reward": 0.760204091668129,
"rewards/improved_len_reward_dast": 0.44990991055965424,
"step": 48
},
{
"completion_length": 1411.4234161376953,
"epoch": 0.1270252754374595,
"grad_norm": 0.18215178056260903,
"kl": 0.0005662441253662109,
"learning_rate": 9.981463331703903e-07,
"loss": 0.0348,
"reward": 1.4565084278583527,
"reward_std": 0.3240164965391159,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.5891614705324173,
"step": 49
},
{
"completion_length": 1923.6836547851562,
"epoch": 0.12961762799740764,
"grad_norm": 0.21182741369137464,
"kl": 0.00043964385986328125,
"learning_rate": 9.977573867575937e-07,
"loss": 0.0483,
"reward": 1.0672244429588318,
"reward_std": 0.42784378305077553,
"rewards/accuracy_reward": 0.7244897782802582,
"rewards/improved_len_reward_dast": 0.342734657227993,
"step": 50
},
{
"completion_length": 2293.10205078125,
"epoch": 0.1322099805573558,
"grad_norm": 0.17784622321620705,
"kl": 0.0005965232849121094,
"learning_rate": 9.9733152666647e-07,
"loss": 0.0011,
"reward": 1.119166985154152,
"reward_std": 0.4692757725715637,
"rewards/accuracy_reward": 0.6836734563112259,
"rewards/improved_len_reward_dast": 0.43549349159002304,
"step": 51
},
{
"completion_length": 2606.8468627929688,
"epoch": 0.13480233311730394,
"grad_norm": 0.16188767449887392,
"kl": 0.0004382133483886719,
"learning_rate": 9.968687880054579e-07,
"loss": 0.0355,
"reward": 1.0624671429395676,
"reward_std": 0.5272083953022957,
"rewards/accuracy_reward": 0.6530612111091614,
"rewards/improved_len_reward_dast": 0.4094058535993099,
"step": 52
},
{
"completion_length": 1741.494873046875,
"epoch": 0.1373946856772521,
"grad_norm": 0.18163262147540796,
"kl": 0.0007987022399902344,
"learning_rate": 9.963692089233104e-07,
"loss": 0.0189,
"reward": 1.1586879789829254,
"reward_std": 0.3523149788379669,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.3678716644644737,
"step": 53
},
{
"completion_length": 1731.5713806152344,
"epoch": 0.13998703823720027,
"grad_norm": 0.17545616003222686,
"kl": 0.000713348388671875,
"learning_rate": 9.958328306059508e-07,
"loss": 0.0163,
"reward": 1.087464839220047,
"reward_std": 0.37970298528671265,
"rewards/accuracy_reward": 0.7499999701976776,
"rewards/improved_len_reward_dast": 0.3374648429453373,
"step": 54
},
{
"completion_length": 1940.2244262695312,
"epoch": 0.1425793907971484,
"grad_norm": 0.20829916863603212,
"kl": 0.0008840560913085938,
"learning_rate": 9.952596972730782e-07,
"loss": 0.0418,
"reward": 1.136895164847374,
"reward_std": 0.21965472772717476,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.37158904783427715,
"step": 55
},
{
"completion_length": 2024.3825988769531,
"epoch": 0.14517174335709657,
"grad_norm": 0.16061899047482414,
"kl": 0.0006990432739257812,
"learning_rate": 9.946498561745201e-07,
"loss": 0.0061,
"reward": 1.3091870546340942,
"reward_std": 0.42107394337654114,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.50816660374403,
"step": 56
},
{
"completion_length": 1990.7856750488281,
"epoch": 0.14776409591704473,
"grad_norm": 0.17205784813401187,
"kl": 0.0008096694946289062,
"learning_rate": 9.94003357586339e-07,
"loss": 0.0362,
"reward": 1.3399446904659271,
"reward_std": 0.34059275686740875,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.5185160860419273,
"step": 57
},
{
"completion_length": 2279.331573486328,
"epoch": 0.15035644847699287,
"grad_norm": 0.1637215457597632,
"kl": 0.0006699562072753906,
"learning_rate": 9.933202548066855e-07,
"loss": 0.0424,
"reward": 1.0715169459581375,
"reward_std": 0.39220181107521057,
"rewards/accuracy_reward": 0.6887754946947098,
"rewards/improved_len_reward_dast": 0.38274142518639565,
"step": 58
},
{
"completion_length": 2313.2499084472656,
"epoch": 0.15294880103694103,
"grad_norm": 0.16376379786761341,
"kl": 0.00083160400390625,
"learning_rate": 9.926006041514068e-07,
"loss": 0.0178,
"reward": 1.142714947462082,
"reward_std": 0.3937602676451206,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.40291906148195267,
"step": 59
},
{
"completion_length": 2046.1631774902344,
"epoch": 0.15554115359688916,
"grad_norm": 0.23236942157628335,
"kl": 0.0009450912475585938,
"learning_rate": 9.918444649494012e-07,
"loss": 0.0662,
"reward": 1.245220124721527,
"reward_std": 0.2695602234452963,
"rewards/accuracy_reward": 0.7755101770162582,
"rewards/improved_len_reward_dast": 0.46970994770526886,
"step": 60
},
{
"completion_length": 2175.6224060058594,
"epoch": 0.15813350615683733,
"grad_norm": 0.15376927864805173,
"kl": 0.0009765625,
"learning_rate": 9.9105189953773e-07,
"loss": 0.0196,
"reward": 1.2470524311065674,
"reward_std": 0.45635347813367844,
"rewards/accuracy_reward": 0.7653061002492905,
"rewards/improved_len_reward_dast": 0.48174627125263214,
"step": 61
},
{
"completion_length": 2337.1581115722656,
"epoch": 0.1607258587167855,
"grad_norm": 0.15218316765828901,
"kl": 0.0008411407470703125,
"learning_rate": 9.90222973256475e-07,
"loss": 0.0249,
"reward": 1.37412428855896,
"reward_std": 0.39829079806804657,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.552695706486702,
"step": 62
},
{
"completion_length": 2680.4183349609375,
"epoch": 0.16331821127673363,
"grad_norm": 0.21218309711028285,
"kl": 0.0010118484497070312,
"learning_rate": 9.89357754443355e-07,
"loss": 0.0529,
"reward": 0.8223338723182678,
"reward_std": 0.4073232337832451,
"rewards/accuracy_reward": 0.5510203987360001,
"rewards/improved_len_reward_dast": 0.2713134288787842,
"step": 63
},
{
"completion_length": 2635.7550048828125,
"epoch": 0.1659105638366818,
"grad_norm": 0.1620590183136494,
"kl": 0.000949859619140625,
"learning_rate": 9.884563144280897e-07,
"loss": 0.0464,
"reward": 1.0863047987222672,
"reward_std": 0.4714929535984993,
"rewards/accuracy_reward": 0.678571417927742,
"rewards/improved_len_reward_dast": 0.40773337706923485,
"step": 64
},
{
"completion_length": 1972.2907104492188,
"epoch": 0.16850291639662995,
"grad_norm": 0.17935605548712222,
"kl": 0.001079559326171875,
"learning_rate": 9.875187275265198e-07,
"loss": 0.0255,
"reward": 1.2364896833896637,
"reward_std": 0.4289153516292572,
"rewards/accuracy_reward": 0.7959183603525162,
"rewards/improved_len_reward_dast": 0.44057128578424454,
"step": 65
},
{
"completion_length": 2525.2091064453125,
"epoch": 0.1710952689565781,
"grad_norm": 0.14682421707314297,
"kl": 0.0012102127075195312,
"learning_rate": 9.865450710344807e-07,
"loss": 0.0344,
"reward": 0.8753379732370377,
"reward_std": 0.3238606099039316,
"rewards/accuracy_reward": 0.5918367132544518,
"rewards/improved_len_reward_dast": 0.2835012301802635,
"step": 66
},
{
"completion_length": 2308.1478576660156,
"epoch": 0.17368762151652625,
"grad_norm": 0.17311806443951758,
"kl": 0.001552581787109375,
"learning_rate": 9.855354252214307e-07,
"loss": 0.0564,
"reward": 1.152388408780098,
"reward_std": 0.4479888826608658,
"rewards/accuracy_reward": 0.7653061002492905,
"rewards/improved_len_reward_dast": 0.3870823085308075,
"step": 67
},
{
"completion_length": 1699.9540405273438,
"epoch": 0.1762799740764744,
"grad_norm": 0.18795647394996712,
"kl": 0.0012683868408203125,
"learning_rate": 9.844898733238311e-07,
"loss": 0.0538,
"reward": 1.4352277517318726,
"reward_std": 0.30926575139164925,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.5678808689117432,
"step": 68
},
{
"completion_length": 1942.3876953125,
"epoch": 0.17887232663642255,
"grad_norm": 0.2210659776524768,
"kl": 0.0016345977783203125,
"learning_rate": 9.83408501538287e-07,
"loss": -0.0183,
"reward": 1.0560709834098816,
"reward_std": 0.44945112615823746,
"rewards/accuracy_reward": 0.7346938699483871,
"rewards/improved_len_reward_dast": 0.32137710228562355,
"step": 69
},
{
"completion_length": 1671.9642639160156,
"epoch": 0.18146467919637072,
"grad_norm": 0.19750773670302219,
"kl": 0.0015382766723632812,
"learning_rate": 9.822913990144387e-07,
"loss": 0.0167,
"reward": 1.1308622658252716,
"reward_std": 0.4337487518787384,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.2737194411456585,
"step": 70
},
{
"completion_length": 2116.3571166992188,
"epoch": 0.18405703175631885,
"grad_norm": 0.1778004806410334,
"kl": 0.00168609619140625,
"learning_rate": 9.811386578476146e-07,
"loss": 0.0029,
"reward": 1.2179836481809616,
"reward_std": 0.46442168205976486,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.457779623568058,
"step": 71
},
{
"completion_length": 1906.9795532226562,
"epoch": 0.18664938431626701,
"grad_norm": 0.1986625505084921,
"kl": 0.001316070556640625,
"learning_rate": 9.79950373071236e-07,
"loss": 0.0285,
"reward": 1.1908049881458282,
"reward_std": 0.3781607896089554,
"rewards/accuracy_reward": 0.7244897931814194,
"rewards/improved_len_reward_dast": 0.4663151800632477,
"step": 72
},
{
"completion_length": 1938.2652587890625,
"epoch": 0.18924173687621518,
"grad_norm": 0.178605084347928,
"kl": 0.001659393310546875,
"learning_rate": 9.787266426489845e-07,
"loss": 0.0145,
"reward": 1.233821153640747,
"reward_std": 0.40631671994924545,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.46341295540332794,
"step": 73
},
{
"completion_length": 2097.5152587890625,
"epoch": 0.1918340894361633,
"grad_norm": 0.21993776817198404,
"kl": 0.0017414093017578125,
"learning_rate": 9.77467567466725e-07,
"loss": 0.0586,
"reward": 1.0030385106801987,
"reward_std": 0.48096026852726936,
"rewards/accuracy_reward": 0.6989795863628387,
"rewards/improved_len_reward_dast": 0.30405890196561813,
"step": 74
},
{
"completion_length": 2267.7193298339844,
"epoch": 0.19442644199611148,
"grad_norm": 0.25966079935566605,
"kl": 0.002155303955078125,
"learning_rate": 9.761732513241882e-07,
"loss": 0.1164,
"reward": 1.1867494583129883,
"reward_std": 0.36580438911914825,
"rewards/accuracy_reward": 0.7346938699483871,
"rewards/improved_len_reward_dast": 0.45205555111169815,
"step": 75
},
{
"completion_length": 1932.4285278320312,
"epoch": 0.1970187945560596,
"grad_norm": 0.18810468542751257,
"kl": 0.0028076171875,
"learning_rate": 9.748438009264142e-07,
"loss": 0.0311,
"reward": 1.302773892879486,
"reward_std": 0.3699945732951164,
"rewards/accuracy_reward": 0.8265306055545807,
"rewards/improved_len_reward_dast": 0.4762432426214218,
"step": 76
},
{
"completion_length": 2192.2601928710938,
"epoch": 0.19961114711600778,
"grad_norm": 0.1818517530996337,
"kl": 0.002178192138671875,
"learning_rate": 9.734793258749538e-07,
"loss": 0.0556,
"reward": 1.2119455933570862,
"reward_std": 0.33562129363417625,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.4517414830625057,
"step": 77
},
{
"completion_length": 2217.4693298339844,
"epoch": 0.20220349967595594,
"grad_norm": 0.17001135134898285,
"kl": 0.002323150634765625,
"learning_rate": 9.720799386588358e-07,
"loss": 0.0214,
"reward": 1.0081346929073334,
"reward_std": 0.5323201268911362,
"rewards/accuracy_reward": 0.6938775479793549,
"rewards/improved_len_reward_dast": 0.3142571374773979,
"step": 78
},
{
"completion_length": 2039.5867309570312,
"epoch": 0.20479585223590407,
"grad_norm": 0.19848985839460778,
"kl": 0.002605438232421875,
"learning_rate": 9.706457546452898e-07,
"loss": 0.0507,
"reward": 1.1386294960975647,
"reward_std": 0.3946889452636242,
"rewards/accuracy_reward": 0.7448979541659355,
"rewards/improved_len_reward_dast": 0.3937314935028553,
"step": 79
},
{
"completion_length": 2590.5305786132812,
"epoch": 0.20738820479585224,
"grad_norm": 0.15129066062202914,
"kl": 0.002803802490234375,
"learning_rate": 9.691768920702379e-07,
"loss": -0.0267,
"reward": 0.8391379117965698,
"reward_std": 0.39438748359680176,
"rewards/accuracy_reward": 0.5765306055545807,
"rewards/improved_len_reward_dast": 0.26260728016495705,
"step": 80
},
{
"completion_length": 2176.096893310547,
"epoch": 0.2099805573558004,
"grad_norm": 0.18394525455650038,
"kl": 0.00240325927734375,
"learning_rate": 9.676734720285456e-07,
"loss": 0.0667,
"reward": 1.148956298828125,
"reward_std": 0.34060123562812805,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.4040583297610283,
"step": 81
},
{
"completion_length": 2104.994842529297,
"epoch": 0.21257290991574854,
"grad_norm": 0.1783774193001553,
"kl": 0.00263214111328125,
"learning_rate": 9.661356184640394e-07,
"loss": 0.0607,
"reward": 1.300699919462204,
"reward_std": 0.29261183738708496,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.5149856060743332,
"step": 82
},
{
"completion_length": 2017.9591674804688,
"epoch": 0.2151652624756967,
"grad_norm": 0.20548002392363018,
"kl": 0.003589630126953125,
"learning_rate": 9.64563458159288e-07,
"loss": 0.0372,
"reward": 1.2817473858594894,
"reward_std": 0.42862868309020996,
"rewards/accuracy_reward": 0.8265305906534195,
"rewards/improved_len_reward_dast": 0.45521679520606995,
"step": 83
},
{
"completion_length": 2365.132568359375,
"epoch": 0.21775761503564484,
"grad_norm": 0.2118006180262065,
"kl": 0.003673553466796875,
"learning_rate": 9.629571207251515e-07,
"loss": 0.0474,
"reward": 1.1858174800872803,
"reward_std": 0.42872869968414307,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.4256134256720543,
"step": 84
},
{
"completion_length": 2227.8111572265625,
"epoch": 0.220349967595593,
"grad_norm": 0.1730257242071835,
"kl": 0.0032958984375,
"learning_rate": 9.613167385900944e-07,
"loss": 0.0116,
"reward": 0.9865487962961197,
"reward_std": 0.30924591794610023,
"rewards/accuracy_reward": 0.6887754946947098,
"rewards/improved_len_reward_dast": 0.2977732727304101,
"step": 85
},
{
"completion_length": 2069.8213806152344,
"epoch": 0.22294232015554116,
"grad_norm": 0.1997054811852766,
"kl": 0.003353118896484375,
"learning_rate": 9.59642446989269e-07,
"loss": 0.0275,
"reward": 1.2090528905391693,
"reward_std": 0.4271962344646454,
"rewards/accuracy_reward": 0.7806122303009033,
"rewards/improved_len_reward_dast": 0.428440660238266,
"step": 86
},
{
"completion_length": 2234.255096435547,
"epoch": 0.2255346727154893,
"grad_norm": 0.1689278406473576,
"kl": 0.0041046142578125,
"learning_rate": 9.579343839533668e-07,
"loss": 0.0395,
"reward": 1.1342998147010803,
"reward_std": 0.3173440955579281,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.3945038840174675,
"step": 87
},
{
"completion_length": 2258.3009643554688,
"epoch": 0.22812702527543746,
"grad_norm": 0.19449538540190586,
"kl": 0.004421234130859375,
"learning_rate": 9.561926902972378e-07,
"loss": 0.0785,
"reward": 1.2548484802246094,
"reward_std": 0.3709937259554863,
"rewards/accuracy_reward": 0.7755101770162582,
"rewards/improved_len_reward_dast": 0.47933831810951233,
"step": 88
},
{
"completion_length": 1870.6989440917969,
"epoch": 0.23071937783538563,
"grad_norm": 0.1864398126735164,
"kl": 0.0042266845703125,
"learning_rate": 9.544175096082838e-07,
"loss": 0.0646,
"reward": 1.4300118386745453,
"reward_std": 0.4286029487848282,
"rewards/accuracy_reward": 0.8928571343421936,
"rewards/improved_len_reward_dast": 0.5371547788381577,
"step": 89
},
{
"completion_length": 2082.653045654297,
"epoch": 0.23331173039533376,
"grad_norm": 0.17766778571294792,
"kl": 0.00475311279296875,
"learning_rate": 9.526089882346172e-07,
"loss": 0.032,
"reward": 1.1855316758155823,
"reward_std": 0.36463288590312004,
"rewards/accuracy_reward": 0.7551020085811615,
"rewards/improved_len_reward_dast": 0.4304296597838402,
"step": 90
},
{
"completion_length": 2117.2244262695312,
"epoch": 0.23590408295528192,
"grad_norm": 0.19874233088672905,
"kl": 0.003894805908203125,
"learning_rate": 9.507672752730001e-07,
"loss": 0.052,
"reward": 1.0779342502355576,
"reward_std": 0.45030639320611954,
"rewards/accuracy_reward": 0.734693855047226,
"rewards/improved_len_reward_dast": 0.3432403616607189,
"step": 91
},
{
"completion_length": 2126.6173095703125,
"epoch": 0.23849643551523006,
"grad_norm": 0.20706633281686568,
"kl": 0.004180908203125,
"learning_rate": 9.4889252255655e-07,
"loss": 0.0681,
"reward": 1.1621150970458984,
"reward_std": 0.2173718847334385,
"rewards/accuracy_reward": 0.7295918315649033,
"rewards/improved_len_reward_dast": 0.43252328783273697,
"step": 92
},
{
"completion_length": 2107.4692993164062,
"epoch": 0.24108878807517822,
"grad_norm": 0.18999527082233988,
"kl": 0.00507354736328125,
"learning_rate": 9.469848846422223e-07,
"loss": 0.0305,
"reward": 0.9012731686234474,
"reward_std": 0.2958849798887968,
"rewards/accuracy_reward": 0.6326530501246452,
"rewards/improved_len_reward_dast": 0.2686200775206089,
"step": 93
},
{
"completion_length": 2329.5662841796875,
"epoch": 0.2436811406351264,
"grad_norm": 0.17793830796024995,
"kl": 0.004726409912109375,
"learning_rate": 9.450445187980699e-07,
"loss": 0.0053,
"reward": 1.0069625079631805,
"reward_std": 0.4442039094865322,
"rewards/accuracy_reward": 0.663265272974968,
"rewards/improved_len_reward_dast": 0.3436972051858902,
"step": 94
},
{
"completion_length": 2371.1223754882812,
"epoch": 0.24627349319507452,
"grad_norm": 0.16551461901403783,
"kl": 0.00560760498046875,
"learning_rate": 9.430715849902774e-07,
"loss": 0.0161,
"reward": 1.1833973824977875,
"reward_std": 0.3829594776034355,
"rewards/accuracy_reward": 0.7551020309329033,
"rewards/improved_len_reward_dast": 0.4282953441143036,
"step": 95
},
{
"completion_length": 1950.9897766113281,
"epoch": 0.24886584575502269,
"grad_norm": 0.22225719247681372,
"kl": 0.004608154296875,
"learning_rate": 9.410662458699723e-07,
"loss": 0.0456,
"reward": 1.138383835554123,
"reward_std": 0.32722293585538864,
"rewards/accuracy_reward": 0.7142857015132904,
"rewards/improved_len_reward_dast": 0.4240981712937355,
"step": 96
},
{
"completion_length": 1459.1683349609375,
"epoch": 0.25145819831497085,
"grad_norm": 0.20670520181853694,
"kl": 0.00476837158203125,
"learning_rate": 9.390286667598169e-07,
"loss": 0.0546,
"reward": 1.3123253285884857,
"reward_std": 0.31760613806545734,
"rewards/accuracy_reward": 0.846938744187355,
"rewards/improved_len_reward_dast": 0.4653865396976471,
"step": 97
},
{
"completion_length": 1836.9029846191406,
"epoch": 0.254050550874919,
"grad_norm": 0.20386220038181252,
"kl": 0.00446319580078125,
"learning_rate": 9.369590156403784e-07,
"loss": 0.0339,
"reward": 1.3093420267105103,
"reward_std": 0.42256173491477966,
"rewards/accuracy_reward": 0.8163265138864517,
"rewards/improved_len_reward_dast": 0.49301546812057495,
"step": 98
},
{
"completion_length": 1921.7550354003906,
"epoch": 0.2566429034348671,
"grad_norm": 0.22385072499443348,
"kl": 0.00586700439453125,
"learning_rate": 9.348574631362808e-07,
"loss": 0.0254,
"reward": 1.369395136833191,
"reward_std": 0.292521633207798,
"rewards/accuracy_reward": 0.8367346823215485,
"rewards/improved_len_reward_dast": 0.5326604098081589,
"step": 99
},
{
"completion_length": 1589.2550659179688,
"epoch": 0.2592352559948153,
"grad_norm": 0.23062182502361955,
"kl": 0.003963470458984375,
"learning_rate": 9.327241825021379e-07,
"loss": 0.0939,
"reward": 1.398920476436615,
"reward_std": 0.34097858518362045,
"rewards/accuracy_reward": 0.8979591578245163,
"rewards/improved_len_reward_dast": 0.5009612441062927,
"step": 100
},
{
"completion_length": 1968.3979187011719,
"epoch": 0.26182760855476345,
"grad_norm": 0.19172453408443837,
"kl": 0.0052337646484375,
"learning_rate": 9.3055934960827e-07,
"loss": 0.033,
"reward": 1.2349633574485779,
"reward_std": 0.4557712897658348,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.46455518156290054,
"step": 101
},
{
"completion_length": 2024.6580810546875,
"epoch": 0.2644199611147116,
"grad_norm": 0.18835419471758258,
"kl": 0.00595855712890625,
"learning_rate": 9.283631429262053e-07,
"loss": -0.0018,
"reward": 1.237942174077034,
"reward_std": 0.4386955201625824,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.4522278979420662,
"step": 102
},
{
"completion_length": 2042.0101623535156,
"epoch": 0.2670123136746598,
"grad_norm": 0.16797444756904736,
"kl": 0.00687408447265625,
"learning_rate": 9.261357435139665e-07,
"loss": 0.0127,
"reward": 1.147979348897934,
"reward_std": 0.39860222302377224,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.3877752497792244,
"step": 103
},
{
"completion_length": 1771.6785278320312,
"epoch": 0.2696046662346079,
"grad_norm": 0.19397130084636785,
"kl": 0.00556182861328125,
"learning_rate": 9.238773350011437e-07,
"loss": 0.0329,
"reward": 1.3575038313865662,
"reward_std": 0.28452699072659016,
"rewards/accuracy_reward": 0.8418367356061935,
"rewards/improved_len_reward_dast": 0.5156671032309532,
"step": 104
},
{
"completion_length": 1984.2295532226562,
"epoch": 0.27219701879455604,
"grad_norm": 0.20491481745891912,
"kl": 0.00533294677734375,
"learning_rate": 9.215881035737557e-07,
"loss": 0.0756,
"reward": 1.3917469382286072,
"reward_std": 0.3919885456562042,
"rewards/accuracy_reward": 0.8673469126224518,
"rewards/improved_len_reward_dast": 0.5244000777602196,
"step": 105
},
{
"completion_length": 2123.3570861816406,
"epoch": 0.2747893713545042,
"grad_norm": 0.19107859298960242,
"kl": 0.00609588623046875,
"learning_rate": 9.192682379589017e-07,
"loss": 0.0343,
"reward": 1.3419382572174072,
"reward_std": 0.550883948802948,
"rewards/accuracy_reward": 0.8163265287876129,
"rewards/improved_len_reward_dast": 0.5256116688251495,
"step": 106
},
{
"completion_length": 2321.183563232422,
"epoch": 0.27738172391445237,
"grad_norm": 0.17417279176148165,
"kl": 0.00618743896484375,
"learning_rate": 9.169179294092006e-07,
"loss": 0.037,
"reward": 1.2553168833255768,
"reward_std": 0.3132058009505272,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.49001070857048035,
"step": 107
},
{
"completion_length": 1755.6121826171875,
"epoch": 0.27997407647440054,
"grad_norm": 0.1910812285243796,
"kl": 0.0055389404296875,
"learning_rate": 9.145373716870257e-07,
"loss": 0.0074,
"reward": 1.1911440938711166,
"reward_std": 0.47732261940836906,
"rewards/accuracy_reward": 0.8265305906534195,
"rewards/improved_len_reward_dast": 0.36461350694298744,
"step": 108
},
{
"completion_length": 2498.53564453125,
"epoch": 0.2825664290343487,
"grad_norm": 0.1847398357059974,
"kl": 0.0076904296875,
"learning_rate": 9.121267610485294e-07,
"loss": 0.0136,
"reward": 1.0379046350717545,
"reward_std": 0.5191724747419357,
"rewards/accuracy_reward": 0.6734693795442581,
"rewards/improved_len_reward_dast": 0.36443524062633514,
"step": 109
},
{
"completion_length": 1881.5408020019531,
"epoch": 0.2851587815942968,
"grad_norm": 0.1895141382280174,
"kl": 0.0063629150390625,
"learning_rate": 9.096862962274642e-07,
"loss": -0.0114,
"reward": 1.2222436666488647,
"reward_std": 0.2921589985489845,
"rewards/accuracy_reward": 0.760204054415226,
"rewards/improved_len_reward_dast": 0.4620395749807358,
"step": 110
},
{
"completion_length": 2229.341827392578,
"epoch": 0.28775113415424497,
"grad_norm": 0.16533064618080134,
"kl": 0.00737762451171875,
"learning_rate": 9.072161784187988e-07,
"loss": 0.029,
"reward": 1.213012382388115,
"reward_std": 0.427090298384428,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.41709401085972786,
"step": 111
},
{
"completion_length": 1740.8673400878906,
"epoch": 0.29034348671419313,
"grad_norm": 0.17704874550004857,
"kl": 0.00606536865234375,
"learning_rate": 9.047166112621312e-07,
"loss": 0.0232,
"reward": 1.3144700229167938,
"reward_std": 0.3366679251194,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.4981435164809227,
"step": 112
},
{
"completion_length": 2048.397918701172,
"epoch": 0.2929358392741413,
"grad_norm": 0.19568646749424262,
"kl": 0.00690460205078125,
"learning_rate": 9.021878008249001e-07,
"loss": 0.0206,
"reward": 1.1744825094938278,
"reward_std": 0.479649193584919,
"rewards/accuracy_reward": 0.7806122303009033,
"rewards/improved_len_reward_dast": 0.3938702493906021,
"step": 113
},
{
"completion_length": 1883.0255126953125,
"epoch": 0.29552819183408946,
"grad_norm": 0.201863471118327,
"kl": 0.007293701171875,
"learning_rate": 8.996299555853973e-07,
"loss": 0.0263,
"reward": 1.3593637347221375,
"reward_std": 0.3963543549180031,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.5175270512700081,
"step": 114
},
{
"completion_length": 1779.4489135742188,
"epoch": 0.29812054439403757,
"grad_norm": 0.21073286141952957,
"kl": 0.00705718994140625,
"learning_rate": 8.970432864155798e-07,
"loss": 0.059,
"reward": 1.284899353981018,
"reward_std": 0.3950739651918411,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.49408305436372757,
"step": 115
},
{
"completion_length": 1918.2244873046875,
"epoch": 0.30071289695398573,
"grad_norm": 0.19227538961602422,
"kl": 0.00742340087890625,
"learning_rate": 8.944280065636851e-07,
"loss": 0.0454,
"reward": 1.2475728243589401,
"reward_std": 0.32171259075403214,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.4618585482239723,
"step": 116
},
{
"completion_length": 1858.4795532226562,
"epoch": 0.3033052495139339,
"grad_norm": 0.19238271005304078,
"kl": 0.00749969482421875,
"learning_rate": 8.917843316366515e-07,
"loss": 0.0387,
"reward": 1.364868402481079,
"reward_std": 0.2818027026951313,
"rewards/accuracy_reward": 0.8316326439380646,
"rewards/improved_len_reward_dast": 0.533235713839531,
"step": 117
},
{
"completion_length": 1993.6224060058594,
"epoch": 0.30589760207388206,
"grad_norm": 0.231864346111992,
"kl": 0.00769805908203125,
"learning_rate": 8.891124795823426e-07,
"loss": -0.0075,
"reward": 1.1190623342990875,
"reward_std": 0.2991497367620468,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.3282460141927004,
"step": 118
},
{
"completion_length": 1985.5509643554688,
"epoch": 0.3084899546338302,
"grad_norm": 0.17623896225871394,
"kl": 0.00771331787109375,
"learning_rate": 8.864126706715796e-07,
"loss": 0.0186,
"reward": 1.2160087823867798,
"reward_std": 0.35445018485188484,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.4711107425391674,
"step": 119
},
{
"completion_length": 2125.1376953125,
"epoch": 0.31108230719377833,
"grad_norm": 0.2263640313290784,
"kl": 0.0087432861328125,
"learning_rate": 8.83685127479982e-07,
"loss": 0.0941,
"reward": 1.281501442193985,
"reward_std": 0.38218285515904427,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.5110933035612106,
"step": 120
},
{
"completion_length": 1814.5611877441406,
"epoch": 0.3136746597537265,
"grad_norm": 0.19715675281839773,
"kl": 0.007568359375,
"learning_rate": 8.809300748696173e-07,
"loss": 0.0386,
"reward": 1.1133249253034592,
"reward_std": 0.3796735033392906,
"rewards/accuracy_reward": 0.7295918315649033,
"rewards/improved_len_reward_dast": 0.38373304158449173,
"step": 121
},
{
"completion_length": 2427.4489135742188,
"epoch": 0.31626701231367466,
"grad_norm": 0.16760355775672944,
"kl": 0.00905609130859375,
"learning_rate": 8.781477399704652e-07,
"loss": 0.0048,
"reward": 1.0130163729190826,
"reward_std": 0.4051677845418453,
"rewards/accuracy_reward": 0.6632652878761292,
"rewards/improved_len_reward_dast": 0.349751066416502,
"step": 122
},
{
"completion_length": 2251.3570861816406,
"epoch": 0.3188593648736228,
"grad_norm": 0.1882544168870131,
"kl": 0.00846099853515625,
"learning_rate": 8.753383521616902e-07,
"loss": 0.0008,
"reward": 1.1944599151611328,
"reward_std": 0.4080551564693451,
"rewards/accuracy_reward": 0.7499999850988388,
"rewards/improved_len_reward_dast": 0.4444599226117134,
"step": 123
},
{
"completion_length": 1852.142822265625,
"epoch": 0.321451717433571,
"grad_norm": 0.22567456549295617,
"kl": 0.007122039794921875,
"learning_rate": 8.72502143052733e-07,
"loss": 0.0421,
"reward": 1.0371171534061432,
"reward_std": 0.4070936441421509,
"rewards/accuracy_reward": 0.6887754946947098,
"rewards/improved_len_reward_dast": 0.34834159165620804,
"step": 124
},
{
"completion_length": 1902.4897766113281,
"epoch": 0.32404406999351915,
"grad_norm": 0.18976500768952323,
"kl": 0.00728607177734375,
"learning_rate": 8.696393464642158e-07,
"loss": -0.0168,
"reward": 1.379349261522293,
"reward_std": 0.34975893795490265,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.5324105769395828,
"step": 125
},
{
"completion_length": 1687.3979187011719,
"epoch": 0.32663642255346725,
"grad_norm": 0.1842833719422884,
"kl": 0.00609588623046875,
"learning_rate": 8.667501984086655e-07,
"loss": 0.0248,
"reward": 1.3401367366313934,
"reward_std": 0.26001402735710144,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.5544224381446838,
"step": 126
},
{
"completion_length": 1719.23974609375,
"epoch": 0.3292287751134154,
"grad_norm": 0.2122526031093734,
"kl": 0.00665283203125,
"learning_rate": 8.638349370710573e-07,
"loss": 0.0493,
"reward": 1.2587095499038696,
"reward_std": 0.30533889308571815,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.4423830099403858,
"step": 127
},
{
"completion_length": 1702.78564453125,
"epoch": 0.3318211276733636,
"grad_norm": 0.18811783070011717,
"kl": 0.00623321533203125,
"learning_rate": 8.608938027891775e-07,
"loss": 0.0049,
"reward": 1.3044427931308746,
"reward_std": 0.47574885934591293,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.49832039326429367,
"step": 128
},
{
"completion_length": 1589.6376953125,
"epoch": 0.33441348023331174,
"grad_norm": 0.2122723729405287,
"kl": 0.007274627685546875,
"learning_rate": 8.579270380338107e-07,
"loss": 0.0378,
"reward": 1.3573221862316132,
"reward_std": 0.40166376531124115,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.510383352637291,
"step": 129
},
{
"completion_length": 2209.2244873046875,
"epoch": 0.3370058327932599,
"grad_norm": 0.18766107651382932,
"kl": 0.0082550048828125,
"learning_rate": 8.549348873887496e-07,
"loss": -0.035,
"reward": 0.9989715814590454,
"reward_std": 0.4630734659731388,
"rewards/accuracy_reward": 0.6734693646430969,
"rewards/improved_len_reward_dast": 0.32550226897001266,
"step": 130
},
{
"completion_length": 1750.2499694824219,
"epoch": 0.339598185353208,
"grad_norm": 0.26668844455154506,
"kl": 0.0062713623046875,
"learning_rate": 8.519175975306312e-07,
"loss": 0.0733,
"reward": 1.0193718448281288,
"reward_std": 0.49021392315626144,
"rewards/accuracy_reward": 0.6989795863628387,
"rewards/improved_len_reward_dast": 0.3203922025859356,
"step": 131
},
{
"completion_length": 1834.892822265625,
"epoch": 0.3421905379131562,
"grad_norm": 0.17123158557193757,
"kl": 0.006275177001953125,
"learning_rate": 8.48875417208601e-07,
"loss": 0.0191,
"reward": 1.2724904865026474,
"reward_std": 0.36864253878593445,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.5020823329687119,
"step": 132
},
{
"completion_length": 1844.9081115722656,
"epoch": 0.34478289047310434,
"grad_norm": 0.1744110793812119,
"kl": 0.00693511962890625,
"learning_rate": 8.458085972238048e-07,
"loss": 0.0332,
"reward": 1.0728662610054016,
"reward_std": 0.4644254148006439,
"rewards/accuracy_reward": 0.7499999850988388,
"rewards/improved_len_reward_dast": 0.3228662498295307,
"step": 133
},
{
"completion_length": 1910.1427917480469,
"epoch": 0.3473752430330525,
"grad_norm": 0.22282630764089068,
"kl": 0.0084686279296875,
"learning_rate": 8.427173904087138e-07,
"loss": 0.0291,
"reward": 1.1172972619533539,
"reward_std": 0.3814988359808922,
"rewards/accuracy_reward": 0.7551020085811615,
"rewards/improved_len_reward_dast": 0.36219523288309574,
"step": 134
},
{
"completion_length": 2461.3775329589844,
"epoch": 0.34996759559300067,
"grad_norm": 0.1595488734110434,
"kl": 0.0104522705078125,
"learning_rate": 8.396020516062794e-07,
"loss": -0.0068,
"reward": 0.9715078249573708,
"reward_std": 0.3740999586880207,
"rewards/accuracy_reward": 0.6173469200730324,
"rewards/improved_len_reward_dast": 0.3541608899831772,
"step": 135
},
{
"completion_length": 1467.096908569336,
"epoch": 0.3525599481529488,
"grad_norm": 0.17905275908990426,
"kl": 0.005458831787109375,
"learning_rate": 8.364628376489242e-07,
"loss": 0.0333,
"reward": 1.558873325586319,
"reward_std": 0.29448162391781807,
"rewards/accuracy_reward": 0.928571417927742,
"rewards/improved_len_reward_dast": 0.6303019374608994,
"step": 136
},
{
"completion_length": 1310.5,
"epoch": 0.35515230071289694,
"grad_norm": 0.20951329036509847,
"kl": 0.0060577392578125,
"learning_rate": 8.333000073373685e-07,
"loss": -0.0166,
"reward": 1.2859368920326233,
"reward_std": 0.3338315784931183,
"rewards/accuracy_reward": 0.8061224520206451,
"rewards/improved_len_reward_dast": 0.47981445118784904,
"step": 137
},
{
"completion_length": 1815.6122436523438,
"epoch": 0.3577446532728451,
"grad_norm": 0.19604752185803775,
"kl": 0.0070953369140625,
"learning_rate": 8.301138214192945e-07,
"loss": 0.0433,
"reward": 1.2342120856046677,
"reward_std": 0.4501468688249588,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.4331916607916355,
"step": 138
},
{
"completion_length": 1862.0764770507812,
"epoch": 0.36033700583279327,
"grad_norm": 0.18709921475186367,
"kl": 0.0084228515625,
"learning_rate": 8.269045425678497e-07,
"loss": -0.011,
"reward": 1.2167351096868515,
"reward_std": 0.3770736940205097,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.45653103291988373,
"step": 139
},
{
"completion_length": 1736.1376953125,
"epoch": 0.36292935839274143,
"grad_norm": 0.19354018571685683,
"kl": 0.0071258544921875,
"learning_rate": 8.236724353599918e-07,
"loss": 0.041,
"reward": 1.496632605791092,
"reward_std": 0.3335278294980526,
"rewards/accuracy_reward": 0.8979591578245163,
"rewards/improved_len_reward_dast": 0.5986734926700592,
"step": 140
},
{
"completion_length": 1628.4183654785156,
"epoch": 0.36552171095268954,
"grad_norm": 0.16803171468726585,
"kl": 0.00705718994140625,
"learning_rate": 8.204177662546763e-07,
"loss": -0.0198,
"reward": 1.2802585661411285,
"reward_std": 0.3480174820870161,
"rewards/accuracy_reward": 0.8163265138864517,
"rewards/improved_len_reward_dast": 0.46393200755119324,
"step": 141
},
{
"completion_length": 1563.2244567871094,
"epoch": 0.3681140635126377,
"grad_norm": 0.21830948983629073,
"kl": 0.006256103515625,
"learning_rate": 8.171408035708906e-07,
"loss": 0.0147,
"reward": 1.477361023426056,
"reward_std": 0.36876992136240005,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.6151161342859268,
"step": 142
},
{
"completion_length": 1426.9744567871094,
"epoch": 0.37070641607258586,
"grad_norm": 0.1829469047156503,
"kl": 0.005870819091796875,
"learning_rate": 8.138418174655323e-07,
"loss": -0.0128,
"reward": 1.475436508655548,
"reward_std": 0.28024090081453323,
"rewards/accuracy_reward": 0.8877550959587097,
"rewards/improved_len_reward_dast": 0.5876814350485802,
"step": 143
},
{
"completion_length": 2269.73974609375,
"epoch": 0.37329876863253403,
"grad_norm": 0.15370768982629232,
"kl": 0.00823974609375,
"learning_rate": 8.105210799111366e-07,
"loss": 0.029,
"reward": 1.0333527326583862,
"reward_std": 0.4238397367298603,
"rewards/accuracy_reward": 0.6632652878761292,
"rewards/improved_len_reward_dast": 0.37008739449083805,
"step": 144
},
{
"completion_length": 1661.2142333984375,
"epoch": 0.3758911211924822,
"grad_norm": 0.1756144937263373,
"kl": 0.006439208984375,
"learning_rate": 8.071788646734564e-07,
"loss": 0.0278,
"reward": 1.297868698835373,
"reward_std": 0.30791742727160454,
"rewards/accuracy_reward": 0.8163265138864517,
"rewards/improved_len_reward_dast": 0.4815421551465988,
"step": 145
},
{
"completion_length": 1629.2754516601562,
"epoch": 0.37848347375243035,
"grad_norm": 0.19753853796416515,
"kl": 0.006805419921875,
"learning_rate": 8.038154472888909e-07,
"loss": -0.0047,
"reward": 1.2643596529960632,
"reward_std": 0.403556901961565,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.45823724940419197,
"step": 146
},
{
"completion_length": 1698.1785278320312,
"epoch": 0.38107582631237846,
"grad_norm": 0.18090958864036752,
"kl": 0.00759124755859375,
"learning_rate": 8.004311050417711e-07,
"loss": -0.0063,
"reward": 1.2380123734474182,
"reward_std": 0.39292842149734497,
"rewards/accuracy_reward": 0.7806122153997421,
"rewards/improved_len_reward_dast": 0.4574001543223858,
"step": 147
},
{
"completion_length": 1603.7703704833984,
"epoch": 0.3836681788723266,
"grad_norm": 0.1689548990240542,
"kl": 0.00655364990234375,
"learning_rate": 7.970261169414999e-07,
"loss": 0.0034,
"reward": 1.2632354497909546,
"reward_std": 0.42876998893916607,
"rewards/accuracy_reward": 0.8010203838348389,
"rewards/improved_len_reward_dast": 0.46221502870321274,
"step": 148
},
{
"completion_length": 2111.928497314453,
"epoch": 0.3862605314322748,
"grad_norm": 0.23403462014206552,
"kl": 0.00902557373046875,
"learning_rate": 7.936007636995497e-07,
"loss": 0.0581,
"reward": 1.1535758823156357,
"reward_std": 0.33541079610586166,
"rewards/accuracy_reward": 0.7091836556792259,
"rewards/improved_len_reward_dast": 0.44439224898815155,
"step": 149
},
{
"completion_length": 1584.5560760498047,
"epoch": 0.38885288399222295,
"grad_norm": 0.19966714442908384,
"kl": 0.00608062744140625,
"learning_rate": 7.901553277063213e-07,
"loss": -0.0136,
"reward": 1.0925945341587067,
"reward_std": 0.4660287909209728,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.3323905020952225,
"step": 150
},
{
"completion_length": 1963.030502319336,
"epoch": 0.3914452365521711,
"grad_norm": 0.17996728024183786,
"kl": 0.0086822509765625,
"learning_rate": 7.866900930078618e-07,
"loss": 0.0058,
"reward": 1.245696559548378,
"reward_std": 0.4446266293525696,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.4854924902319908,
"step": 151
},
{
"completion_length": 1893.0254821777344,
"epoch": 0.3940375891121192,
"grad_norm": 0.16735022993158205,
"kl": 0.007110595703125,
"learning_rate": 7.832053452824489e-07,
"loss": 0.0104,
"reward": 1.2418105602264404,
"reward_std": 0.4090575650334358,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.4714023545384407,
"step": 152
},
{
"completion_length": 1724.3111572265625,
"epoch": 0.3966299416720674,
"grad_norm": 0.1864010620729168,
"kl": 0.00872802734375,
"learning_rate": 7.797013718170384e-07,
"loss": 0.0296,
"reward": 1.1897482573986053,
"reward_std": 0.3867075741291046,
"rewards/accuracy_reward": 0.7755101919174194,
"rewards/improved_len_reward_dast": 0.4142380841076374,
"step": 153
},
{
"completion_length": 1520.3673553466797,
"epoch": 0.39922229423201555,
"grad_norm": 0.19558753420229233,
"kl": 0.006317138671875,
"learning_rate": 7.761784614835801e-07,
"loss": -0.0009,
"reward": 1.1826948821544647,
"reward_std": 0.44549785554409027,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.3969806134700775,
"step": 154
},
{
"completion_length": 1902.83154296875,
"epoch": 0.4018146467919637,
"grad_norm": 0.1628442801355898,
"kl": 0.007907867431640625,
"learning_rate": 7.726369047152029e-07,
"loss": 0.0111,
"reward": 1.1829434633255005,
"reward_std": 0.4352233223617077,
"rewards/accuracy_reward": 0.7346938699483871,
"rewards/improved_len_reward_dast": 0.44824954867362976,
"step": 155
},
{
"completion_length": 1687.5867004394531,
"epoch": 0.4044069993519119,
"grad_norm": 0.15254799874290897,
"kl": 0.0055694580078125,
"learning_rate": 7.690769934822712e-07,
"loss": 0.0209,
"reward": 1.3427188694477081,
"reward_std": 0.39824075251817703,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.5212903171777725,
"step": 156
},
{
"completion_length": 1699.2857055664062,
"epoch": 0.40699935191186,
"grad_norm": 0.17162045711276386,
"kl": 0.00756072998046875,
"learning_rate": 7.654990212683142e-07,
"loss": 0.0029,
"reward": 1.3672717213630676,
"reward_std": 0.34800875186920166,
"rewards/accuracy_reward": 0.8520407974720001,
"rewards/improved_len_reward_dast": 0.5152308940887451,
"step": 157
},
{
"completion_length": 1642.4897766113281,
"epoch": 0.40959170447180815,
"grad_norm": 0.17781118941038052,
"kl": 0.0069427490234375,
"learning_rate": 7.619032830458307e-07,
"loss": 0.0238,
"reward": 1.36138716340065,
"reward_std": 0.42799485474824905,
"rewards/accuracy_reward": 0.8520407974720001,
"rewards/improved_len_reward_dast": 0.5093463957309723,
"step": 158
},
{
"completion_length": 2058.10205078125,
"epoch": 0.4121840570317563,
"grad_norm": 0.21486100887413462,
"kl": 0.00844573974609375,
"learning_rate": 7.582900752519723e-07,
"loss": 0.052,
"reward": 1.2367046475410461,
"reward_std": 0.4686100408434868,
"rewards/accuracy_reward": 0.7857142835855484,
"rewards/improved_len_reward_dast": 0.45099035650491714,
"step": 159
},
{
"completion_length": 2116.7601928710938,
"epoch": 0.4147764095917045,
"grad_norm": 0.21872883985010524,
"kl": 0.00928497314453125,
"learning_rate": 7.546596957641031e-07,
"loss": 0.0469,
"reward": 1.1451009958982468,
"reward_std": 0.2814931422472,
"rewards/accuracy_reward": 0.7244897782802582,
"rewards/improved_len_reward_dast": 0.4206111915409565,
"step": 160
},
{
"completion_length": 2057.1172790527344,
"epoch": 0.41736876215165264,
"grad_norm": 0.223277485058984,
"kl": 0.0099639892578125,
"learning_rate": 7.510124438752432e-07,
"loss": 0.0282,
"reward": 1.2358856201171875,
"reward_std": 0.42381204664707184,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.4501713886857033,
"step": 161
},
{
"completion_length": 1648.7907409667969,
"epoch": 0.4199611147116008,
"grad_norm": 0.19361427922643096,
"kl": 0.007965087890625,
"learning_rate": 7.473486202693949e-07,
"loss": 0.0283,
"reward": 1.5626276433467865,
"reward_std": 0.33783891052007675,
"rewards/accuracy_reward": 0.9081632643938065,
"rewards/improved_len_reward_dast": 0.6544643938541412,
"step": 162
},
{
"completion_length": 1720.7805938720703,
"epoch": 0.4225534672715489,
"grad_norm": 0.22042630118078563,
"kl": 0.008636474609375,
"learning_rate": 7.43668526996753e-07,
"loss": 0.0517,
"reward": 1.203346148133278,
"reward_std": 0.48596539348363876,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.43293796479701996,
"step": 163
},
{
"completion_length": 1918.5816345214844,
"epoch": 0.4251458198314971,
"grad_norm": 0.20825217508460148,
"kl": 0.0105438232421875,
"learning_rate": 7.399724674488046e-07,
"loss": 0.0313,
"reward": 1.2619640827178955,
"reward_std": 0.3394176550209522,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.49665799736976624,
"step": 164
},
{
"completion_length": 1879.0867004394531,
"epoch": 0.42773817239144524,
"grad_norm": 0.20859456410748778,
"kl": 0.00949859619140625,
"learning_rate": 7.36260746333316e-07,
"loss": 0.1032,
"reward": 1.250516802072525,
"reward_std": 0.21495914831757545,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.48521073907613754,
"step": 165
},
{
"completion_length": 1788.2040405273438,
"epoch": 0.4303305249513934,
"grad_norm": 0.19365279193672524,
"kl": 0.00925445556640625,
"learning_rate": 7.325336696492128e-07,
"loss": 0.031,
"reward": 1.3934488892555237,
"reward_std": 0.3679058402776718,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.5261020287871361,
"step": 166
},
{
"completion_length": 2040.7346801757812,
"epoch": 0.43292287751134156,
"grad_norm": 0.1746728685861396,
"kl": 0.010894775390625,
"learning_rate": 7.287915446613531e-07,
"loss": 0.0021,
"reward": 1.270061433315277,
"reward_std": 0.3740099295973778,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.46393903344869614,
"step": 167
},
{
"completion_length": 2118.234649658203,
"epoch": 0.43551523007128967,
"grad_norm": 0.20129074148639173,
"kl": 0.013275146484375,
"learning_rate": 7.250346798751953e-07,
"loss": 0.006,
"reward": 0.9839373528957367,
"reward_std": 0.581517793238163,
"rewards/accuracy_reward": 0.6785714030265808,
"rewards/improved_len_reward_dast": 0.3053659498691559,
"step": 168
},
{
"completion_length": 1795.9540252685547,
"epoch": 0.43810758263123784,
"grad_norm": 0.1813953032982878,
"kl": 0.009395599365234375,
"learning_rate": 7.212633850113662e-07,
"loss": 0.0235,
"reward": 1.178409919142723,
"reward_std": 0.4242382049560547,
"rewards/accuracy_reward": 0.734693855047226,
"rewards/improved_len_reward_dast": 0.44371599704027176,
"step": 169
},
{
"completion_length": 1421.1734619140625,
"epoch": 0.440699935191186,
"grad_norm": 0.18794137958282095,
"kl": 0.008941650390625,
"learning_rate": 7.174779709801253e-07,
"loss": 0.0159,
"reward": 1.4234746396541595,
"reward_std": 0.32885606586933136,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.5612297654151917,
"step": 170
},
{
"completion_length": 1736.6632690429688,
"epoch": 0.44329228775113416,
"grad_norm": 0.22796049151575712,
"kl": 0.009891510009765625,
"learning_rate": 7.136787498557344e-07,
"loss": 0.0088,
"reward": 1.3514071702957153,
"reward_std": 0.40995020419359207,
"rewards/accuracy_reward": 0.846938744187355,
"rewards/improved_len_reward_dast": 0.5044683739542961,
"step": 171
},
{
"completion_length": 1768.7193603515625,
"epoch": 0.4458846403110823,
"grad_norm": 0.25032479837006205,
"kl": 0.010284423828125,
"learning_rate": 7.098660348507293e-07,
"loss": 0.0732,
"reward": 1.269765853881836,
"reward_std": 0.46360351890325546,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.4993576854467392,
"step": 172
},
{
"completion_length": 1956.9999694824219,
"epoch": 0.44847699287103043,
"grad_norm": 0.17507117871432235,
"kl": 0.0093231201171875,
"learning_rate": 7.060401402900977e-07,
"loss": 0.0185,
"reward": 1.1613440364599228,
"reward_std": 0.5052430480718613,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.42154809460043907,
"step": 173
},
{
"completion_length": 1834.2601623535156,
"epoch": 0.4510693454309786,
"grad_norm": 0.19217203672529928,
"kl": 0.01007843017578125,
"learning_rate": 7.022013815853672e-07,
"loss": 0.0209,
"reward": 1.0959883034229279,
"reward_std": 0.47629018872976303,
"rewards/accuracy_reward": 0.7295918166637421,
"rewards/improved_len_reward_dast": 0.3663964793086052,
"step": 174
},
{
"completion_length": 1817.4489440917969,
"epoch": 0.45366169799092676,
"grad_norm": 0.19322905501288215,
"kl": 0.01153564453125,
"learning_rate": 6.983500752086006e-07,
"loss": 0.0448,
"reward": 1.2833284437656403,
"reward_std": 0.43457718193531036,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.4874100536108017,
"step": 175
},
{
"completion_length": 1651.7244873046875,
"epoch": 0.4562540505508749,
"grad_norm": 0.19443121591302054,
"kl": 0.00969696044921875,
"learning_rate": 6.94486538666307e-07,
"loss": 0.0327,
"reward": 1.254166454076767,
"reward_std": 0.4054510071873665,
"rewards/accuracy_reward": 0.7806122452020645,
"rewards/improved_len_reward_dast": 0.47355421632528305,
"step": 176
},
{
"completion_length": 1690.4234313964844,
"epoch": 0.4588464031108231,
"grad_norm": 0.2099852909442493,
"kl": 0.0092010498046875,
"learning_rate": 6.906110904732656e-07,
"loss": -0.0115,
"reward": 1.3241359293460846,
"reward_std": 0.4749620705842972,
"rewards/accuracy_reward": 0.8163265138864517,
"rewards/improved_len_reward_dast": 0.5078093633055687,
"step": 177
},
{
"completion_length": 2150.1529541015625,
"epoch": 0.46143875567077125,
"grad_norm": 0.16262254100217993,
"kl": 0.01073455810546875,
"learning_rate": 6.867240501262666e-07,
"loss": 0.0219,
"reward": 1.3224327564239502,
"reward_std": 0.31201132386922836,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.5622286796569824,
"step": 178
},
{
"completion_length": 1616.73974609375,
"epoch": 0.46403110823071936,
"grad_norm": 0.2054857790671321,
"kl": 0.010406494140625,
"learning_rate": 6.828257380777723e-07,
"loss": -0.0028,
"reward": 1.2023987025022507,
"reward_std": 0.38464218378067017,
"rewards/accuracy_reward": 0.8214285671710968,
"rewards/improved_len_reward_dast": 0.38097016140818596,
"step": 179
},
{
"completion_length": 1939.9744567871094,
"epoch": 0.4666234607906675,
"grad_norm": 0.18969129476831767,
"kl": 0.0137481689453125,
"learning_rate": 6.789164757094978e-07,
"loss": 0.035,
"reward": 1.1967380195856094,
"reward_std": 0.3427240923047066,
"rewards/accuracy_reward": 0.734693855047226,
"rewards/improved_len_reward_dast": 0.4620441570878029,
"step": 180
},
{
"completion_length": 1848.25,
"epoch": 0.4692158133506157,
"grad_norm": 0.18668896975291646,
"kl": 0.011810302734375,
"learning_rate": 6.749965853059164e-07,
"loss": 0.0536,
"reward": 1.3282198309898376,
"reward_std": 0.4290488064289093,
"rewards/accuracy_reward": 0.8520407974720001,
"rewards/improved_len_reward_dast": 0.47617900371551514,
"step": 181
},
{
"completion_length": 1659.9489440917969,
"epoch": 0.47180816591056385,
"grad_norm": 0.2068391235436955,
"kl": 0.0099334716796875,
"learning_rate": 6.710663900276903e-07,
"loss": 0.0149,
"reward": 1.1044558137655258,
"reward_std": 0.389005184173584,
"rewards/accuracy_reward": 0.7244897931814194,
"rewards/improved_len_reward_dast": 0.37996600940823555,
"step": 182
},
{
"completion_length": 1548.0152893066406,
"epoch": 0.474400518470512,
"grad_norm": 0.19942963085334378,
"kl": 0.00998687744140625,
"learning_rate": 6.671262138850274e-07,
"loss": 0.0277,
"reward": 1.4036801755428314,
"reward_std": 0.325181283056736,
"rewards/accuracy_reward": 0.846938744187355,
"rewards/improved_len_reward_dast": 0.5567413941025734,
"step": 183
},
{
"completion_length": 1479.9234619140625,
"epoch": 0.4769928710304601,
"grad_norm": 0.17528837750916904,
"kl": 0.00907135009765625,
"learning_rate": 6.631763817109717e-07,
"loss": 0.0212,
"reward": 1.4963186979293823,
"reward_std": 0.2380654364824295,
"rewards/accuracy_reward": 0.8826530426740646,
"rewards/improved_len_reward_dast": 0.6136656627058983,
"step": 184
},
{
"completion_length": 1625.2856750488281,
"epoch": 0.4795852235904083,
"grad_norm": 0.2340295745334256,
"kl": 0.00994873046875,
"learning_rate": 6.592172191346218e-07,
"loss": 0.0387,
"reward": 1.3299905359745026,
"reward_std": 0.4121420457959175,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.5085620209574699,
"step": 185
},
{
"completion_length": 1799.586669921875,
"epoch": 0.48217757615035645,
"grad_norm": 0.208310701570096,
"kl": 0.012359619140625,
"learning_rate": 6.552490525542864e-07,
"loss": 0.0341,
"reward": 1.2161507308483124,
"reward_std": 0.3565462492406368,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.4559466913342476,
"step": 186
},
{
"completion_length": 1612.836685180664,
"epoch": 0.4847699287103046,
"grad_norm": 0.1767048426760215,
"kl": 0.0106048583984375,
"learning_rate": 6.512722091105757e-07,
"loss": -0.0013,
"reward": 1.3248589038848877,
"reward_std": 0.45474397391080856,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.5136343911290169,
"step": 187
},
{
"completion_length": 1306.5509796142578,
"epoch": 0.4873622812702528,
"grad_norm": 0.212241902185087,
"kl": 0.00981903076171875,
"learning_rate": 6.472870166594314e-07,
"loss": 0.0047,
"reward": 1.4141908586025238,
"reward_std": 0.4169772267341614,
"rewards/accuracy_reward": 0.8418367058038712,
"rewards/improved_len_reward_dast": 0.5723541006445885,
"step": 188
},
{
"completion_length": 1914.642822265625,
"epoch": 0.4899546338302009,
"grad_norm": 0.2520686184939368,
"kl": 0.0127410888671875,
"learning_rate": 6.432938037450974e-07,
"loss": -0.0237,
"reward": 1.1971821933984756,
"reward_std": 0.3514118604362011,
"rewards/accuracy_reward": 0.7499999850988388,
"rewards/improved_len_reward_dast": 0.44718217849731445,
"step": 189
},
{
"completion_length": 1808.9183349609375,
"epoch": 0.49254698639014904,
"grad_norm": 0.2130749709969565,
"kl": 0.01201629638671875,
"learning_rate": 6.392928995730352e-07,
"loss": 0.0412,
"reward": 1.2710473388433456,
"reward_std": 0.3865230418741703,
"rewards/accuracy_reward": 0.7908163219690323,
"rewards/improved_len_reward_dast": 0.48023101314902306,
"step": 190
},
{
"completion_length": 1365.4795837402344,
"epoch": 0.4951393389500972,
"grad_norm": 0.250237755024117,
"kl": 0.00952911376953125,
"learning_rate": 6.352846339827826e-07,
"loss": 0.095,
"reward": 1.5109961926937103,
"reward_std": 0.30784352123737335,
"rewards/accuracy_reward": 0.9132653027772903,
"rewards/improved_len_reward_dast": 0.5977308824658394,
"step": 191
},
{
"completion_length": 1425.2755126953125,
"epoch": 0.49773169151004537,
"grad_norm": 0.22368363257945995,
"kl": 0.0114288330078125,
"learning_rate": 6.312693374207627e-07,
"loss": 0.0195,
"reward": 1.2838004529476166,
"reward_std": 0.46850764751434326,
"rewards/accuracy_reward": 0.8265306055545807,
"rewards/improved_len_reward_dast": 0.4572698399424553,
"step": 192
},
{
"completion_length": 1588.5101623535156,
"epoch": 0.5003240440699935,
"grad_norm": 0.20204139731047027,
"kl": 0.01300048828125,
"learning_rate": 6.272473409130397e-07,
"loss": 0.0012,
"reward": 1.3159003108739853,
"reward_std": 0.4093224108219147,
"rewards/accuracy_reward": 0.8316326439380646,
"rewards/improved_len_reward_dast": 0.484267670661211,
"step": 193
},
{
"completion_length": 1411.3571166992188,
"epoch": 0.5029163966299417,
"grad_norm": 0.19443397701968118,
"kl": 0.00821685791015625,
"learning_rate": 6.232189760380301e-07,
"loss": 0.0224,
"reward": 1.288124531507492,
"reward_std": 0.3209230378270149,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.5024102553725243,
"step": 194
},
{
"completion_length": 1751.6785278320312,
"epoch": 0.5055087491898899,
"grad_norm": 0.18304814418314927,
"kl": 0.0109100341796875,
"learning_rate": 6.191845748991671e-07,
"loss": -0.007,
"reward": 1.0736610293388367,
"reward_std": 0.32857421785593033,
"rewards/accuracy_reward": 0.6581632494926453,
"rewards/improved_len_reward_dast": 0.41549770161509514,
"step": 195
},
{
"completion_length": 1771.5968933105469,
"epoch": 0.508101101749838,
"grad_norm": 0.20612952277089522,
"kl": 0.0137939453125,
"learning_rate": 6.151444700975203e-07,
"loss": 0.0106,
"reward": 1.360820233821869,
"reward_std": 0.38221075385808945,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.518983505666256,
"step": 196
},
{
"completion_length": 2076.3060913085938,
"epoch": 0.5106934543097861,
"grad_norm": 0.22320859434163112,
"kl": 0.0132293701171875,
"learning_rate": 6.110989947043767e-07,
"loss": 0.0519,
"reward": 1.101119041442871,
"reward_std": 0.4651700109243393,
"rewards/accuracy_reward": 0.7244897931814194,
"rewards/improved_len_reward_dast": 0.37662921100854874,
"step": 197
},
{
"completion_length": 1513.6530151367188,
"epoch": 0.5132858068697342,
"grad_norm": 0.24160481879222073,
"kl": 0.0120849609375,
"learning_rate": 6.070484822337816e-07,
"loss": 0.0617,
"reward": 1.3807711601257324,
"reward_std": 0.30266276001930237,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.5185262858867645,
"step": 198
},
{
"completion_length": 1659.4744262695312,
"epoch": 0.5158781594296824,
"grad_norm": 0.2860111752617934,
"kl": 0.0122528076171875,
"learning_rate": 6.029932666150431e-07,
"loss": 0.0487,
"reward": 1.27889584004879,
"reward_std": 0.40974466502666473,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.4778754487633705,
"step": 199
},
{
"completion_length": 1553.6479187011719,
"epoch": 0.5184705119896306,
"grad_norm": 0.17284042761570728,
"kl": 0.0113372802734375,
"learning_rate": 5.989336821652029e-07,
"loss": -0.0157,
"reward": 1.292808324098587,
"reward_std": 0.3536081798374653,
"rewards/accuracy_reward": 0.7755101919174194,
"rewards/improved_len_reward_dast": 0.517298124730587,
"step": 200
},
{
"completion_length": 1221.6734313964844,
"epoch": 0.5210628645495787,
"grad_norm": 0.20576387898105802,
"kl": 0.00975799560546875,
"learning_rate": 5.948700635614745e-07,
"loss": 0.0155,
"reward": 1.043928012251854,
"reward_std": 0.5074506774544716,
"rewards/accuracy_reward": 0.734693855047226,
"rewards/improved_len_reward_dast": 0.3092341625597328,
"step": 201
},
{
"completion_length": 1443.3367156982422,
"epoch": 0.5236552171095269,
"grad_norm": 0.190656293014884,
"kl": 0.01007080078125,
"learning_rate": 5.908027458136518e-07,
"loss": 0.027,
"reward": 1.5769412517547607,
"reward_std": 0.27542993798851967,
"rewards/accuracy_reward": 0.9081632494926453,
"rewards/improved_len_reward_dast": 0.6687779873609543,
"step": 202
},
{
"completion_length": 1383.1325988769531,
"epoch": 0.5262475696694751,
"grad_norm": 0.18700146403961007,
"kl": 0.00789642333984375,
"learning_rate": 5.867320642364916e-07,
"loss": -0.0,
"reward": 1.4069096446037292,
"reward_std": 0.452865906059742,
"rewards/accuracy_reward": 0.8571428507566452,
"rewards/improved_len_reward_dast": 0.5497667863965034,
"step": 203
},
{
"completion_length": 1636.7448425292969,
"epoch": 0.5288399222294232,
"grad_norm": 0.18621798443065538,
"kl": 0.01001739501953125,
"learning_rate": 5.826583544220678e-07,
"loss": 0.0023,
"reward": 1.1149714589118958,
"reward_std": 0.5129830092191696,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.3751755505800247,
"step": 204
},
{
"completion_length": 1296.4540252685547,
"epoch": 0.5314322747893714,
"grad_norm": 0.24973009441281563,
"kl": 0.00960540771484375,
"learning_rate": 5.78581952212107e-07,
"loss": 0.057,
"reward": 1.439581423997879,
"reward_std": 0.20332731679081917,
"rewards/accuracy_reward": 0.8775510191917419,
"rewards/improved_len_reward_dast": 0.5620303899049759,
"step": 205
},
{
"completion_length": 1675.2040405273438,
"epoch": 0.5340246273493195,
"grad_norm": 0.17994542833868402,
"kl": 0.0113983154296875,
"learning_rate": 5.745031936702997e-07,
"loss": 0.0212,
"reward": 1.236918032169342,
"reward_std": 0.4141309931874275,
"rewards/accuracy_reward": 0.7755101919174194,
"rewards/improved_len_reward_dast": 0.46140778064727783,
"step": 206
},
{
"completion_length": 1685.6376953125,
"epoch": 0.5366169799092677,
"grad_norm": 0.19387833193950482,
"kl": 0.0142364501953125,
"learning_rate": 5.704224150545956e-07,
"loss": 0.0032,
"reward": 1.1570499688386917,
"reward_std": 0.4146932289004326,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.4172540530562401,
"step": 207
},
{
"completion_length": 1249.0101928710938,
"epoch": 0.5392093324692158,
"grad_norm": 0.1923070203823955,
"kl": 0.0085906982421875,
"learning_rate": 5.663399527894816e-07,
"loss": 0.0138,
"reward": 1.4272409826517105,
"reward_std": 0.34243838489055634,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.5649960786104202,
"step": 208
},
{
"completion_length": 1525.1734313964844,
"epoch": 0.5418016850291639,
"grad_norm": 0.19609225255735566,
"kl": 0.01036834716796875,
"learning_rate": 5.622561434382467e-07,
"loss": 0.0011,
"reward": 1.1873522847890854,
"reward_std": 0.4918947294354439,
"rewards/accuracy_reward": 0.8010203838348389,
"rewards/improved_len_reward_dast": 0.386331919580698,
"step": 209
},
{
"completion_length": 1988.4591064453125,
"epoch": 0.5443940375891121,
"grad_norm": 0.2322805815292897,
"kl": 0.0143280029296875,
"learning_rate": 5.581713236752361e-07,
"loss": 0.0289,
"reward": 1.1922202408313751,
"reward_std": 0.2860515546053648,
"rewards/accuracy_reward": 0.7244897782802582,
"rewards/improved_len_reward_dast": 0.46773041412234306,
"step": 210
},
{
"completion_length": 1433.290771484375,
"epoch": 0.5469863901490603,
"grad_norm": 0.2984688713886969,
"kl": 0.0114898681640625,
"learning_rate": 5.540858302580934e-07,
"loss": 0.0818,
"reward": 1.3492214977741241,
"reward_std": 0.3557019531726837,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.48697663098573685,
"step": 211
},
{
"completion_length": 1686.086669921875,
"epoch": 0.5495787427090084,
"grad_norm": 0.17323504261296585,
"kl": 0.01081085205078125,
"learning_rate": 5.5e-07,
"loss": -0.0227,
"reward": 0.910240039229393,
"reward_std": 0.49440842866897583,
"rewards/accuracy_reward": 0.6632653027772903,
"rewards/improved_len_reward_dast": 0.24697477743029594,
"step": 212
},
{
"completion_length": 1503.3571166992188,
"epoch": 0.5521710952689566,
"grad_norm": 0.19940687047680583,
"kl": 0.0108795166015625,
"learning_rate": 5.459141697419066e-07,
"loss": 0.0196,
"reward": 1.414816826581955,
"reward_std": 0.24907327815890312,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.5525719411671162,
"step": 213
},
{
"completion_length": 1326.4744720458984,
"epoch": 0.5547634478289047,
"grad_norm": 0.1968213437884411,
"kl": 0.00897216796875,
"learning_rate": 5.418286763247641e-07,
"loss": 0.0333,
"reward": 1.5710687637329102,
"reward_std": 0.27853039279580116,
"rewards/accuracy_reward": 0.9336734712123871,
"rewards/improved_len_reward_dast": 0.6373953074216843,
"step": 214
},
{
"completion_length": 1814.7856750488281,
"epoch": 0.5573558003888529,
"grad_norm": 0.1910754560182501,
"kl": 0.0157623291015625,
"learning_rate": 5.377438565617532e-07,
"loss": 0.0053,
"reward": 1.1130409240722656,
"reward_std": 0.5712603330612183,
"rewards/accuracy_reward": 0.7091836780309677,
"rewards/improved_len_reward_dast": 0.4038572832942009,
"step": 215
},
{
"completion_length": 2041.4693603515625,
"epoch": 0.5599481529488011,
"grad_norm": 0.19528431114703992,
"kl": 0.017974853515625,
"learning_rate": 5.336600472105186e-07,
"loss": 0.0026,
"reward": 1.1326239556074142,
"reward_std": 0.5115986987948418,
"rewards/accuracy_reward": 0.7193877249956131,
"rewards/improved_len_reward_dast": 0.41323617100715637,
"step": 216
},
{
"completion_length": 1490.438720703125,
"epoch": 0.5625405055087492,
"grad_norm": 0.1818395863982982,
"kl": 0.011444091796875,
"learning_rate": 5.295775849454045e-07,
"loss": -0.025,
"reward": 1.1338547468185425,
"reward_std": 0.26832524314522743,
"rewards/accuracy_reward": 0.75,
"rewards/improved_len_reward_dast": 0.3838547393679619,
"step": 217
},
{
"completion_length": 1993.8571166992188,
"epoch": 0.5651328580686974,
"grad_norm": 0.23754078779498058,
"kl": 0.0171356201171875,
"learning_rate": 5.254968063297003e-07,
"loss": -0.0245,
"reward": 1.088214099407196,
"reward_std": 0.33989886194467545,
"rewards/accuracy_reward": 0.6938775330781937,
"rewards/improved_len_reward_dast": 0.3943365402519703,
"step": 218
},
{
"completion_length": 1916.8775024414062,
"epoch": 0.5677252106286454,
"grad_norm": 0.23169329147427764,
"kl": 0.0146942138671875,
"learning_rate": 5.214180477878931e-07,
"loss": -0.0216,
"reward": 1.1535532772541046,
"reward_std": 0.5523173958063126,
"rewards/accuracy_reward": 0.739795908331871,
"rewards/improved_len_reward_dast": 0.4137573465704918,
"step": 219
},
{
"completion_length": 2072.586700439453,
"epoch": 0.5703175631885936,
"grad_norm": 0.179237513002948,
"kl": 0.0157623291015625,
"learning_rate": 5.173416455779323e-07,
"loss": 0.0061,
"reward": 1.129465639591217,
"reward_std": 0.47254087403416634,
"rewards/accuracy_reward": 0.7397958934307098,
"rewards/improved_len_reward_dast": 0.3896697536110878,
"step": 220
},
{
"completion_length": 1500.7499694824219,
"epoch": 0.5729099157485418,
"grad_norm": 0.18878843129064268,
"kl": 0.01107025146484375,
"learning_rate": 5.132679357635086e-07,
"loss": -0.0142,
"reward": 1.1763963997364044,
"reward_std": 0.48718392848968506,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.40598829090595245,
"step": 221
},
{
"completion_length": 1644.9030151367188,
"epoch": 0.5755022683084899,
"grad_norm": 0.17742073908553643,
"kl": 0.0126495361328125,
"learning_rate": 5.091972541863481e-07,
"loss": 0.0186,
"reward": 1.1986051201820374,
"reward_std": 0.4172977935522795,
"rewards/accuracy_reward": 0.734693855047226,
"rewards/improved_len_reward_dast": 0.463911272585392,
"step": 222
},
{
"completion_length": 1161.091812133789,
"epoch": 0.5780946208684381,
"grad_norm": 0.189357723748229,
"kl": 0.00917816162109375,
"learning_rate": 5.051299364385257e-07,
"loss": 0.0034,
"reward": 1.5119259655475616,
"reward_std": 0.34742674231529236,
"rewards/accuracy_reward": 0.9030611962080002,
"rewards/improved_len_reward_dast": 0.6088647544384003,
"step": 223
},
{
"completion_length": 2160.7142944335938,
"epoch": 0.5806869734283863,
"grad_norm": 0.1958816052872559,
"kl": 0.0196075439453125,
"learning_rate": 5.010663178347971e-07,
"loss": 0.0345,
"reward": 1.2357909381389618,
"reward_std": 0.4518684595823288,
"rewards/accuracy_reward": 0.7448979318141937,
"rewards/improved_len_reward_dast": 0.4908929914236069,
"step": 224
},
{
"completion_length": 1368.7703552246094,
"epoch": 0.5832793259883344,
"grad_norm": 0.2126816864157868,
"kl": 0.01153564453125,
"learning_rate": 4.970067333849568e-07,
"loss": 0.0421,
"reward": 1.3800954520702362,
"reward_std": 0.24764511361718178,
"rewards/accuracy_reward": 0.8163265287876129,
"rewards/improved_len_reward_dast": 0.5637688413262367,
"step": 225
},
{
"completion_length": 1523.7958984375,
"epoch": 0.5858716785482826,
"grad_norm": 0.2103498219912096,
"kl": 0.013336181640625,
"learning_rate": 4.929515177662182e-07,
"loss": 0.0336,
"reward": 1.3088043332099915,
"reward_std": 0.3938099816441536,
"rewards/accuracy_reward": 0.8214285671710968,
"rewards/improved_len_reward_dast": 0.48737573623657227,
"step": 226
},
{
"completion_length": 1753.9897766113281,
"epoch": 0.5884640311082308,
"grad_norm": 0.17623732882686455,
"kl": 0.0133514404296875,
"learning_rate": 4.889010052956233e-07,
"loss": 0.0184,
"reward": 1.1956195682287216,
"reward_std": 0.38174545764923096,
"rewards/accuracy_reward": 0.7551020234823227,
"rewards/improved_len_reward_dast": 0.44051752984523773,
"step": 227
},
{
"completion_length": 1186.4795837402344,
"epoch": 0.5910563836681789,
"grad_norm": 0.19103765244425439,
"kl": 0.00911712646484375,
"learning_rate": 4.848555299024798e-07,
"loss": -0.0025,
"reward": 1.3858640789985657,
"reward_std": 0.2998353075236082,
"rewards/accuracy_reward": 0.8724489808082581,
"rewards/improved_len_reward_dast": 0.5134151205420494,
"step": 228
},
{
"completion_length": 1717.0713806152344,
"epoch": 0.593648736228127,
"grad_norm": 0.1787260124676487,
"kl": 0.01560211181640625,
"learning_rate": 4.80815425100833e-07,
"loss": 0.0131,
"reward": 1.2940033674240112,
"reward_std": 0.3880784399807453,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.5031870305538177,
"step": 229
},
{
"completion_length": 1570.3979187011719,
"epoch": 0.5962410887880751,
"grad_norm": 0.1932563584259016,
"kl": 0.0125732421875,
"learning_rate": 4.7678102396196983e-07,
"loss": 0.0028,
"reward": 1.194681242108345,
"reward_std": 0.36879952996969223,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.4242731127887964,
"step": 230
},
{
"completion_length": 1627.1173400878906,
"epoch": 0.5988334413480233,
"grad_norm": 0.20069193255347081,
"kl": 0.01148223876953125,
"learning_rate": 4.727526590869605e-07,
"loss": -0.0024,
"reward": 1.2599404603242874,
"reward_std": 0.3717983737587929,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.45381802320480347,
"step": 231
},
{
"completion_length": 1422.693832397461,
"epoch": 0.6014257939079715,
"grad_norm": 0.22397903045763606,
"kl": 0.011993408203125,
"learning_rate": 4.6873066257923735e-07,
"loss": -0.0198,
"reward": 1.1824947893619537,
"reward_std": 0.3314864858984947,
"rewards/accuracy_reward": 0.7806122153997421,
"rewards/improved_len_reward_dast": 0.4018825590610504,
"step": 232
},
{
"completion_length": 2077.2550659179688,
"epoch": 0.6040181464679196,
"grad_norm": 0.2622807945246562,
"kl": 0.0151519775390625,
"learning_rate": 4.647153660172173e-07,
"loss": 0.0607,
"reward": 1.1635594964027405,
"reward_std": 0.392416313290596,
"rewards/accuracy_reward": 0.7499999701976776,
"rewards/improved_len_reward_dast": 0.4135594889521599,
"step": 233
},
{
"completion_length": 1738.4336547851562,
"epoch": 0.6066104990278678,
"grad_norm": 0.24814578097643056,
"kl": 0.01483917236328125,
"learning_rate": 4.607071004269647e-07,
"loss": 0.031,
"reward": 1.369605004787445,
"reward_std": 0.3843038082122803,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.5583804696798325,
"step": 234
},
{
"completion_length": 1602.0713806152344,
"epoch": 0.609202851587816,
"grad_norm": 0.2094489678458985,
"kl": 0.01458740234375,
"learning_rate": 4.567061962549025e-07,
"loss": -0.0277,
"reward": 1.1768890023231506,
"reward_std": 0.5075602382421494,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.4115828797221184,
"step": 235
},
{
"completion_length": 1883.586669921875,
"epoch": 0.6117952041477641,
"grad_norm": 0.18539849926073623,
"kl": 0.01873779296875,
"learning_rate": 4.527129833405687e-07,
"loss": 0.0234,
"reward": 1.2962508648633957,
"reward_std": 0.23112722299993038,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.5309447646141052,
"step": 236
},
{
"completion_length": 1541.188720703125,
"epoch": 0.6143875567077123,
"grad_norm": 0.2211580384146908,
"kl": 0.013671875,
"learning_rate": 4.4872779088942425e-07,
"loss": 0.027,
"reward": 1.3446270525455475,
"reward_std": 0.4020156227052212,
"rewards/accuracy_reward": 0.8265305906534195,
"rewards/improved_len_reward_dast": 0.5180964693427086,
"step": 237
},
{
"completion_length": 1877.1122131347656,
"epoch": 0.6169799092676604,
"grad_norm": 0.27937868976565,
"kl": 0.0175018310546875,
"learning_rate": 4.447509474457135e-07,
"loss": -0.0519,
"reward": 1.3078001737594604,
"reward_std": 0.3943771682679653,
"rewards/accuracy_reward": 0.811224490404129,
"rewards/improved_len_reward_dast": 0.49657563865184784,
"step": 238
},
{
"completion_length": 1735.6836547851562,
"epoch": 0.6195722618276086,
"grad_norm": 0.19004402096856263,
"kl": 0.013519287109375,
"learning_rate": 4.4078278086537823e-07,
"loss": 0.019,
"reward": 1.430199384689331,
"reward_std": 0.45470841974020004,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.5883626788854599,
"step": 239
},
{
"completion_length": 1290.8877258300781,
"epoch": 0.6221646143875567,
"grad_norm": 0.20039034607000805,
"kl": 0.00916290283203125,
"learning_rate": 4.3682361828902846e-07,
"loss": 0.0204,
"reward": 1.4429042339324951,
"reward_std": 0.40230638161301613,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.5857614576816559,
"step": 240
},
{
"completion_length": 1543.5713958740234,
"epoch": 0.6247569669475048,
"grad_norm": 0.1796128893155037,
"kl": 0.0121002197265625,
"learning_rate": 4.328737861149726e-07,
"loss": 0.0061,
"reward": 1.060480311512947,
"reward_std": 0.4090285710990429,
"rewards/accuracy_reward": 0.7040816247463226,
"rewards/improved_len_reward_dast": 0.35639870166778564,
"step": 241
},
{
"completion_length": 1650.6581420898438,
"epoch": 0.627349319507453,
"grad_norm": 0.17035045538288204,
"kl": 0.0127410888671875,
"learning_rate": 4.289336099723098e-07,
"loss": -0.0068,
"reward": 1.2868027091026306,
"reward_std": 0.4846101552248001,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.49088432639837265,
"step": 242
},
{
"completion_length": 1806.8724212646484,
"epoch": 0.6299416720674011,
"grad_norm": 0.21153725027052578,
"kl": 0.01531982421875,
"learning_rate": 4.250034146940834e-07,
"loss": 0.0342,
"reward": 1.3773571997880936,
"reward_std": 0.32580330967903137,
"rewards/accuracy_reward": 0.8265305906534195,
"rewards/improved_len_reward_dast": 0.5508265644311905,
"step": 243
},
{
"completion_length": 1506.8877410888672,
"epoch": 0.6325340246273493,
"grad_norm": 0.20274200364313702,
"kl": 0.01300048828125,
"learning_rate": 4.210835242905023e-07,
"loss": 0.0114,
"reward": 1.3944001197814941,
"reward_std": 0.35993905924260616,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.5270532071590424,
"step": 244
},
{
"completion_length": 1694.5713806152344,
"epoch": 0.6351263771872975,
"grad_norm": 0.20631633070295144,
"kl": 0.01531982421875,
"learning_rate": 4.1717426192222784e-07,
"loss": 0.001,
"reward": 1.269565299153328,
"reward_std": 0.3799453191459179,
"rewards/accuracy_reward": 0.7908162921667099,
"rewards/improved_len_reward_dast": 0.4787489101290703,
"step": 245
},
{
"completion_length": 2018.9642028808594,
"epoch": 0.6377187297472456,
"grad_norm": 0.23377044647625822,
"kl": 0.01549530029296875,
"learning_rate": 4.1327594987373347e-07,
"loss": 0.0057,
"reward": 0.9710913375020027,
"reward_std": 0.4150635525584221,
"rewards/accuracy_reward": 0.6479591578245163,
"rewards/improved_len_reward_dast": 0.3231321321800351,
"step": 246
},
{
"completion_length": 1953.44384765625,
"epoch": 0.6403110823071938,
"grad_norm": 0.18922091960973522,
"kl": 0.0152740478515625,
"learning_rate": 4.0938890952673443e-07,
"loss": -0.0073,
"reward": 1.144493117928505,
"reward_std": 0.326381828635931,
"rewards/accuracy_reward": 0.6989795714616776,
"rewards/improved_len_reward_dast": 0.445513516664505,
"step": 247
},
{
"completion_length": 1779.9234771728516,
"epoch": 0.642903434867142,
"grad_norm": 0.19009690153217312,
"kl": 0.01587677001953125,
"learning_rate": 4.05513461333693e-07,
"loss": 0.0056,
"reward": 1.2144882082939148,
"reward_std": 0.3660648465156555,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.45428410917520523,
"step": 248
},
{
"completion_length": 1680.5816040039062,
"epoch": 0.6454957874270901,
"grad_norm": 0.18737871436935236,
"kl": 0.01519775390625,
"learning_rate": 4.016499247913994e-07,
"loss": 0.0155,
"reward": 1.228882908821106,
"reward_std": 0.42849814891815186,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.4584747403860092,
"step": 249
},
{
"completion_length": 1700.0765075683594,
"epoch": 0.6480881399870383,
"grad_norm": 0.19083582747427946,
"kl": 0.01373291015625,
"learning_rate": 3.977986184146328e-07,
"loss": 0.0276,
"reward": 1.4491282403469086,
"reward_std": 0.29963432252407074,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.6021894812583923,
"step": 250
},
{
"completion_length": 1699.5050964355469,
"epoch": 0.6506804925469863,
"grad_norm": 0.18294974628895902,
"kl": 0.01318359375,
"learning_rate": 3.939598597099022e-07,
"loss": -0.0028,
"reward": 1.1291119307279587,
"reward_std": 0.4640827924013138,
"rewards/accuracy_reward": 0.7499999850988388,
"rewards/improved_len_reward_dast": 0.3791119046509266,
"step": 251
},
{
"completion_length": 1555.9489135742188,
"epoch": 0.6532728451069345,
"grad_norm": 0.2987585035266382,
"kl": 0.013702392578125,
"learning_rate": 3.9013396514927076e-07,
"loss": -0.0182,
"reward": 1.2567480206489563,
"reward_std": 0.38375869020819664,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.4710337221622467,
"step": 252
},
{
"completion_length": 2022.5509643554688,
"epoch": 0.6558651976668827,
"grad_norm": 0.16778625708063813,
"kl": 0.0160064697265625,
"learning_rate": 3.8632125014426566e-07,
"loss": 0.0026,
"reward": 1.0748438835144043,
"reward_std": 0.3207223527133465,
"rewards/accuracy_reward": 0.6836734712123871,
"rewards/improved_len_reward_dast": 0.3911704570055008,
"step": 253
},
{
"completion_length": 2008.7550659179688,
"epoch": 0.6584575502268308,
"grad_norm": 0.20081517128616475,
"kl": 0.017364501953125,
"learning_rate": 3.8252202901987474e-07,
"loss": -0.0036,
"reward": 1.1095408350229263,
"reward_std": 0.42732013761997223,
"rewards/accuracy_reward": 0.7193877398967743,
"rewards/improved_len_reward_dast": 0.39015308022499084,
"step": 254
},
{
"completion_length": 1753.5305786132812,
"epoch": 0.661049902786779,
"grad_norm": 0.19286213527020518,
"kl": 0.015838623046875,
"learning_rate": 3.7873661498863384e-07,
"loss": -0.0193,
"reward": 1.3401989042758942,
"reward_std": 0.44482723623514175,
"rewards/accuracy_reward": 0.8367346823215485,
"rewards/improved_len_reward_dast": 0.5034642219543457,
"step": 255
},
{
"completion_length": 1714.8316040039062,
"epoch": 0.6636422553467272,
"grad_norm": 0.19098352531749854,
"kl": 0.015716552734375,
"learning_rate": 3.7496532012480463e-07,
"loss": -0.0172,
"reward": 1.285597413778305,
"reward_std": 0.3779995068907738,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.4947810471057892,
"step": 256
},
{
"completion_length": 1587.0254821777344,
"epoch": 0.6662346079066753,
"grad_norm": 0.1828164836366847,
"kl": 0.01513671875,
"learning_rate": 3.7120845533864706e-07,
"loss": 0.0165,
"reward": 1.2909784018993378,
"reward_std": 0.3537175990641117,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.5001621246337891,
"step": 257
},
{
"completion_length": 1945.8519897460938,
"epoch": 0.6688269604666235,
"grad_norm": 0.2401064586242113,
"kl": 0.018310546875,
"learning_rate": 3.6746633035078723e-07,
"loss": -0.0254,
"reward": 0.9318393021821976,
"reward_std": 0.3634992204606533,
"rewards/accuracy_reward": 0.6530612260103226,
"rewards/improved_len_reward_dast": 0.2787781246006489,
"step": 258
},
{
"completion_length": 1464.5356903076172,
"epoch": 0.6714193130265717,
"grad_norm": 0.19897550034047456,
"kl": 0.0117645263671875,
"learning_rate": 3.63739253666684e-07,
"loss": 0.0257,
"reward": 1.3326016068458557,
"reward_std": 0.25891564041376114,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.48566286638379097,
"step": 259
},
{
"completion_length": 2040.6173095703125,
"epoch": 0.6740116655865198,
"grad_norm": 0.2093225075876704,
"kl": 0.01587677001953125,
"learning_rate": 3.6002753255119533e-07,
"loss": 0.0446,
"reward": 1.1549495160579681,
"reward_std": 0.6060752719640732,
"rewards/accuracy_reward": 0.7295918166637421,
"rewards/improved_len_reward_dast": 0.42535772174596786,
"step": 260
},
{
"completion_length": 1504.892837524414,
"epoch": 0.6766040181464679,
"grad_norm": 0.2413238757963301,
"kl": 0.013092041015625,
"learning_rate": 3.5633147300324706e-07,
"loss": 0.039,
"reward": 1.3253722488880157,
"reward_std": 0.22303567081689835,
"rewards/accuracy_reward": 0.7755101919174194,
"rewards/improved_len_reward_dast": 0.5498620271682739,
"step": 261
},
{
"completion_length": 1835.6020202636719,
"epoch": 0.679196370706416,
"grad_norm": 0.1742605810963208,
"kl": 0.0152587890625,
"learning_rate": 3.526513797306051e-07,
"loss": 0.023,
"reward": 1.3810910284519196,
"reward_std": 0.3878571353852749,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.5341522693634033,
"step": 262
},
{
"completion_length": 1934.44384765625,
"epoch": 0.6817887232663642,
"grad_norm": 0.18402016017590034,
"kl": 0.0189971923828125,
"learning_rate": 3.489875561247568e-07,
"loss": 0.0326,
"reward": 1.1064758449792862,
"reward_std": 0.5427646264433861,
"rewards/accuracy_reward": 0.75,
"rewards/improved_len_reward_dast": 0.3564758636057377,
"step": 263
},
{
"completion_length": 1527.6479187011719,
"epoch": 0.6843810758263124,
"grad_norm": 0.2535051321853217,
"kl": 0.0133209228515625,
"learning_rate": 3.453403042358968e-07,
"loss": 0.0594,
"reward": 1.3837721645832062,
"reward_std": 0.3384307250380516,
"rewards/accuracy_reward": 0.8571428507566452,
"rewards/improved_len_reward_dast": 0.5266292989253998,
"step": 264
},
{
"completion_length": 1750.1275329589844,
"epoch": 0.6869734283862605,
"grad_norm": 0.20005193883523226,
"kl": 0.014312744140625,
"learning_rate": 3.417099247480277e-07,
"loss": 0.0069,
"reward": 1.1163494735956192,
"reward_std": 0.4810503050684929,
"rewards/accuracy_reward": 0.7295918166637421,
"rewards/improved_len_reward_dast": 0.3867576252669096,
"step": 265
},
{
"completion_length": 1910.5254821777344,
"epoch": 0.6895657809462087,
"grad_norm": 0.3018048627256463,
"kl": 0.0156402587890625,
"learning_rate": 3.3809671695416916e-07,
"loss": 0.0357,
"reward": 1.147754654288292,
"reward_std": 0.5025169178843498,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.3824485056102276,
"step": 266
},
{
"completion_length": 1284.0663146972656,
"epoch": 0.6921581335061568,
"grad_norm": 0.18258330323366856,
"kl": 0.0092926025390625,
"learning_rate": 3.345009787316859e-07,
"loss": 0.0015,
"reward": 1.4202894866466522,
"reward_std": 0.2870555892586708,
"rewards/accuracy_reward": 0.8418367058038712,
"rewards/improved_len_reward_dast": 0.5784527361392975,
"step": 267
},
{
"completion_length": 1557.5612030029297,
"epoch": 0.694750486066105,
"grad_norm": 0.1849700340313966,
"kl": 0.012725830078125,
"learning_rate": 3.309230065177289e-07,
"loss": -0.0079,
"reward": 1.4877441823482513,
"reward_std": 0.302555400878191,
"rewards/accuracy_reward": 0.8622448742389679,
"rewards/improved_len_reward_dast": 0.6254993677139282,
"step": 268
},
{
"completion_length": 1482.5203552246094,
"epoch": 0.6973428386260532,
"grad_norm": 0.19171071001803489,
"kl": 0.0144500732421875,
"learning_rate": 3.273630952847971e-07,
"loss": -0.0012,
"reward": 1.2047373950481415,
"reward_std": 0.48537394404411316,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.4445333182811737,
"step": 269
},
{
"completion_length": 1744.6070861816406,
"epoch": 0.6999351911860013,
"grad_norm": 0.17132128213246742,
"kl": 0.01513671875,
"learning_rate": 3.2382153851641996e-07,
"loss": 0.0229,
"reward": 1.1097373962402344,
"reward_std": 0.2911606300622225,
"rewards/accuracy_reward": 0.7295918166637421,
"rewards/improved_len_reward_dast": 0.38014551997184753,
"step": 270
},
{
"completion_length": 1705.5968933105469,
"epoch": 0.7025275437459495,
"grad_norm": 0.2582533948663525,
"kl": 0.01708984375,
"learning_rate": 3.202986281829616e-07,
"loss": 0.045,
"reward": 1.3047520220279694,
"reward_std": 0.4435114786028862,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.4986295886337757,
"step": 271
},
{
"completion_length": 1806.591796875,
"epoch": 0.7051198963058976,
"grad_norm": 0.17993615347196873,
"kl": 0.01581573486328125,
"learning_rate": 3.1679465471755106e-07,
"loss": 0.016,
"reward": 1.2005809843540192,
"reward_std": 0.2893667705357075,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.45568302273750305,
"step": 272
},
{
"completion_length": 1960.2244262695312,
"epoch": 0.7077122488658457,
"grad_norm": 0.21394731890393012,
"kl": 0.018402099609375,
"learning_rate": 3.1330990699213824e-07,
"loss": 0.0026,
"reward": 1.3150149285793304,
"reward_std": 0.32834067940711975,
"rewards/accuracy_reward": 0.7602040469646454,
"rewards/improved_len_reward_dast": 0.5548108592629433,
"step": 273
},
{
"completion_length": 1648.7601623535156,
"epoch": 0.7103046014257939,
"grad_norm": 0.22677843577967902,
"kl": 0.0144500732421875,
"learning_rate": 3.0984467229367885e-07,
"loss": -0.0289,
"reward": 1.186056673526764,
"reward_std": 0.3048909828066826,
"rewards/accuracy_reward": 0.7653061002492905,
"rewards/improved_len_reward_dast": 0.42075058072805405,
"step": 274
},
{
"completion_length": 1631.3876953125,
"epoch": 0.712896953985742,
"grad_norm": 0.18075852179231652,
"kl": 0.0135955810546875,
"learning_rate": 3.063992363004503e-07,
"loss": -0.0047,
"reward": 1.3900758624076843,
"reward_std": 0.35281531512737274,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.5737493187189102,
"step": 275
},
{
"completion_length": 1794.5203857421875,
"epoch": 0.7154893065456902,
"grad_norm": 0.20597152512904204,
"kl": 0.0141143798828125,
"learning_rate": 3.0297388305850004e-07,
"loss": 0.0135,
"reward": 1.2308696657419205,
"reward_std": 0.3947853706777096,
"rewards/accuracy_reward": 0.7959183603525162,
"rewards/improved_len_reward_dast": 0.434951264411211,
"step": 276
},
{
"completion_length": 1608.892822265625,
"epoch": 0.7180816591056384,
"grad_norm": 0.22201185510570046,
"kl": 0.0151519775390625,
"learning_rate": 2.9956889495822877e-07,
"loss": 0.0463,
"reward": 1.3714110851287842,
"reward_std": 0.41973991319537163,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.549982562661171,
"step": 277
},
{
"completion_length": 1833.0203552246094,
"epoch": 0.7206740116655865,
"grad_norm": 0.18677648497687657,
"kl": 0.0153656005859375,
"learning_rate": 2.961845527111091e-07,
"loss": 0.0087,
"reward": 1.1960042417049408,
"reward_std": 0.35424697771668434,
"rewards/accuracy_reward": 0.7499999850988388,
"rewards/improved_len_reward_dast": 0.4460042342543602,
"step": 278
},
{
"completion_length": 1663.1989440917969,
"epoch": 0.7232663642255347,
"grad_norm": 0.23408313686800128,
"kl": 0.0152435302734375,
"learning_rate": 2.9282113532654363e-07,
"loss": 0.0496,
"reward": 1.2954119145870209,
"reward_std": 0.4828920140862465,
"rewards/accuracy_reward": 0.8265306055545807,
"rewards/improved_len_reward_dast": 0.46888134628534317,
"step": 279
},
{
"completion_length": 1693.0254974365234,
"epoch": 0.7258587167854829,
"grad_norm": 0.23913668563173046,
"kl": 0.019439697265625,
"learning_rate": 2.894789200888634e-07,
"loss": 0.0174,
"reward": 1.4143796861171722,
"reward_std": 0.37724653631448746,
"rewards/accuracy_reward": 0.8367346674203873,
"rewards/improved_len_reward_dast": 0.5776450335979462,
"step": 280
},
{
"completion_length": 1277.8468780517578,
"epoch": 0.728451069345431,
"grad_norm": 0.2694215840510146,
"kl": 0.0134429931640625,
"learning_rate": 2.8615818253446766e-07,
"loss": 0.0046,
"reward": 1.4540930390357971,
"reward_std": 0.3243625983595848,
"rewards/accuracy_reward": 0.8775509893894196,
"rewards/improved_len_reward_dast": 0.5765420496463776,
"step": 281
},
{
"completion_length": 1236.0356903076172,
"epoch": 0.7310434219053791,
"grad_norm": 0.1871177689494516,
"kl": 0.0116729736328125,
"learning_rate": 2.828591964291093e-07,
"loss": 0.0055,
"reward": 1.2881307899951935,
"reward_std": 0.42027105391025543,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.466702226549387,
"step": 282
},
{
"completion_length": 1389.3673095703125,
"epoch": 0.7336357744653272,
"grad_norm": 0.17949852486745174,
"kl": 0.0106201171875,
"learning_rate": 2.7958223374532363e-07,
"loss": -0.029,
"reward": 1.2979092001914978,
"reward_std": 0.34224472381174564,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.4407663494348526,
"step": 283
},
{
"completion_length": 1291.64794921875,
"epoch": 0.7362281270252754,
"grad_norm": 0.20498717449578613,
"kl": 0.01025390625,
"learning_rate": 2.7632756464000835e-07,
"loss": 0.0333,
"reward": 1.6148460805416107,
"reward_std": 0.25412340462207794,
"rewards/accuracy_reward": 0.9234693795442581,
"rewards/improved_len_reward_dast": 0.6913766860961914,
"step": 284
},
{
"completion_length": 1941.4284973144531,
"epoch": 0.7388204795852236,
"grad_norm": 0.19896247201933293,
"kl": 0.019378662109375,
"learning_rate": 2.730954574321503e-07,
"loss": 0.0303,
"reward": 1.0792112797498703,
"reward_std": 0.38586486876010895,
"rewards/accuracy_reward": 0.7142857015132904,
"rewards/improved_len_reward_dast": 0.3649255894124508,
"step": 285
},
{
"completion_length": 1503.8826446533203,
"epoch": 0.7414128321451717,
"grad_norm": 0.22350544706234096,
"kl": 0.01275634765625,
"learning_rate": 2.698861785807055e-07,
"loss": 0.0311,
"reward": 1.5651328265666962,
"reward_std": 0.3553974963724613,
"rewards/accuracy_reward": 0.9030612260103226,
"rewards/improved_len_reward_dast": 0.6620715856552124,
"step": 286
},
{
"completion_length": 1731.8214111328125,
"epoch": 0.7440051847051199,
"grad_norm": 0.23609281842069962,
"kl": 0.0157470703125,
"learning_rate": 2.6669999266263154e-07,
"loss": -0.0306,
"reward": 1.1723814904689789,
"reward_std": 0.5022178217768669,
"rewards/accuracy_reward": 0.7602040767669678,
"rewards/improved_len_reward_dast": 0.41217736527323723,
"step": 287
},
{
"completion_length": 1870.0458679199219,
"epoch": 0.7465975372650681,
"grad_norm": 0.15632978700328695,
"kl": 0.0158843994140625,
"learning_rate": 2.635371623510758e-07,
"loss": 0.0204,
"reward": 1.0800221413373947,
"reward_std": 0.2878151945769787,
"rewards/accuracy_reward": 0.6887754872441292,
"rewards/improved_len_reward_dast": 0.39124663546681404,
"step": 288
},
{
"completion_length": 1414.2703552246094,
"epoch": 0.7491898898250162,
"grad_norm": 0.23286966119816113,
"kl": 0.0133056640625,
"learning_rate": 2.6039794839372066e-07,
"loss": -0.0074,
"reward": 1.341863602399826,
"reward_std": 0.36198627576231956,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.530639074742794,
"step": 289
},
{
"completion_length": 1749.2295532226562,
"epoch": 0.7517822423849644,
"grad_norm": 0.17241966258758817,
"kl": 0.0135955810546875,
"learning_rate": 2.5728260959128614e-07,
"loss": -0.0129,
"reward": 1.2213443964719772,
"reward_std": 0.4387034922838211,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.46114034205675125,
"step": 290
},
{
"completion_length": 2126.826446533203,
"epoch": 0.7543745949449125,
"grad_norm": 0.2030042278234921,
"kl": 0.018890380859375,
"learning_rate": 2.541914027761951e-07,
"loss": 0.0435,
"reward": 1.1566181033849716,
"reward_std": 0.505137488245964,
"rewards/accuracy_reward": 0.7244897782802582,
"rewards/improved_len_reward_dast": 0.43212827295064926,
"step": 291
},
{
"completion_length": 1632.0713653564453,
"epoch": 0.7569669475048607,
"grad_norm": 0.24718377241844533,
"kl": 0.016876220703125,
"learning_rate": 2.511245827913991e-07,
"loss": 0.0421,
"reward": 1.2267541885375977,
"reward_std": 0.3394501358270645,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.4563460126519203,
"step": 292
},
{
"completion_length": 1807.6529846191406,
"epoch": 0.7595593000648088,
"grad_norm": 0.1861047697263272,
"kl": 0.01556396484375,
"learning_rate": 2.4808240246936866e-07,
"loss": -0.0078,
"reward": 1.2387667298316956,
"reward_std": 0.4819525480270386,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.44284842535853386,
"step": 293
},
{
"completion_length": 1847.19384765625,
"epoch": 0.7621516526247569,
"grad_norm": 0.22670935044930915,
"kl": 0.018310546875,
"learning_rate": 2.450651126112504e-07,
"loss": 0.0266,
"reward": 1.4322427809238434,
"reward_std": 0.2754583992063999,
"rewards/accuracy_reward": 0.8418367058038712,
"rewards/improved_len_reward_dast": 0.590406060218811,
"step": 294
},
{
"completion_length": 1595.9795532226562,
"epoch": 0.7647440051847051,
"grad_norm": 0.20527730505286215,
"kl": 0.015838623046875,
"learning_rate": 2.4207296196618924e-07,
"loss": 0.0242,
"reward": 1.3626587092876434,
"reward_std": 0.32539451494812965,
"rewards/accuracy_reward": 0.7908162921667099,
"rewards/improved_len_reward_dast": 0.5718424171209335,
"step": 295
},
{
"completion_length": 1054.137710571289,
"epoch": 0.7673363577446533,
"grad_norm": 0.21493362850187817,
"kl": 0.0093536376953125,
"learning_rate": 2.3910619721082253e-07,
"loss": 0.0196,
"reward": 1.4152240753173828,
"reward_std": 0.35989922285079956,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.5478771775960922,
"step": 296
},
{
"completion_length": 1474.3367004394531,
"epoch": 0.7699287103046014,
"grad_norm": 0.20358206304391516,
"kl": 0.0144500732421875,
"learning_rate": 2.3616506292894282e-07,
"loss": 0.0271,
"reward": 1.4626062214374542,
"reward_std": 0.29278943687677383,
"rewards/accuracy_reward": 0.8775510042905807,
"rewards/improved_len_reward_dast": 0.5850552245974541,
"step": 297
},
{
"completion_length": 1752.2295227050781,
"epoch": 0.7725210628645496,
"grad_norm": 0.1833066106969091,
"kl": 0.015289306640625,
"learning_rate": 2.332498015913344e-07,
"loss": 0.0009,
"reward": 1.3457911014556885,
"reward_std": 0.2773626856505871,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.5345666632056236,
"step": 298
},
{
"completion_length": 1325.688720703125,
"epoch": 0.7751134154244977,
"grad_norm": 0.19517765602950424,
"kl": 0.01210784912109375,
"learning_rate": 2.303606535357843e-07,
"loss": 0.0599,
"reward": 1.5037426948547363,
"reward_std": 0.26091703958809376,
"rewards/accuracy_reward": 0.8775510191917419,
"rewards/improved_len_reward_dast": 0.6261917278170586,
"step": 299
},
{
"completion_length": 1663.0662689208984,
"epoch": 0.7777057679844459,
"grad_norm": 0.20601240191104908,
"kl": 0.01605224609375,
"learning_rate": 2.2749785694726685e-07,
"loss": 0.0094,
"reward": 1.3560754358768463,
"reward_std": 0.37762896716594696,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.5346468687057495,
"step": 300
},
{
"completion_length": 1426.6173095703125,
"epoch": 0.7802981205443941,
"grad_norm": 0.20108821286385423,
"kl": 0.0143585205078125,
"learning_rate": 2.2466164783830972e-07,
"loss": 0.0207,
"reward": 1.3399082869291306,
"reward_std": 0.3976980447769165,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.5337858349084854,
"step": 301
},
{
"completion_length": 1790.8978881835938,
"epoch": 0.7828904731043422,
"grad_norm": 0.21383459811515595,
"kl": 0.0155029296875,
"learning_rate": 2.2185226002953483e-07,
"loss": 0.0004,
"reward": 1.2710506618022919,
"reward_std": 0.3618534617125988,
"rewards/accuracy_reward": 0.785714253783226,
"rewards/improved_len_reward_dast": 0.4853363707661629,
"step": 302
},
{
"completion_length": 1939.8775024414062,
"epoch": 0.7854828256642904,
"grad_norm": 0.29379980912133363,
"kl": 0.01885986328125,
"learning_rate": 2.1906992513038268e-07,
"loss": 0.0479,
"reward": 1.2805213034152985,
"reward_std": 0.4143086224794388,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.4692968502640724,
"step": 303
},
{
"completion_length": 1614.3775329589844,
"epoch": 0.7880751782242384,
"grad_norm": 0.17729210448855,
"kl": 0.0162353515625,
"learning_rate": 2.1631487252001822e-07,
"loss": 0.0049,
"reward": 1.234568029642105,
"reward_std": 0.417904369533062,
"rewards/accuracy_reward": 0.8010203838348389,
"rewards/improved_len_reward_dast": 0.43354763835668564,
"step": 304
},
{
"completion_length": 2287.780548095703,
"epoch": 0.7906675307841866,
"grad_norm": 1.2242934021255432,
"kl": 0.021087646484375,
"learning_rate": 2.1358732932842032e-07,
"loss": 0.0211,
"reward": 1.0315402448177338,
"reward_std": 0.36217188835144043,
"rewards/accuracy_reward": 0.6581632494926453,
"rewards/improved_len_reward_dast": 0.3733769580721855,
"step": 305
},
{
"completion_length": 1723.3673400878906,
"epoch": 0.7932598833441348,
"grad_norm": 0.20686736211065535,
"kl": 0.015533447265625,
"learning_rate": 2.1088752041765734e-07,
"loss": 0.0319,
"reward": 1.3500191867351532,
"reward_std": 0.3599831163883209,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.5438967421650887,
"step": 306
},
{
"completion_length": 1528.9183654785156,
"epoch": 0.7958522359040829,
"grad_norm": 0.21573348295043995,
"kl": 0.015960693359375,
"learning_rate": 2.0821566836334847e-07,
"loss": -0.0098,
"reward": 1.3639625310897827,
"reward_std": 0.3467046692967415,
"rewards/accuracy_reward": 0.8469387590885162,
"rewards/improved_len_reward_dast": 0.5170237571001053,
"step": 307
},
{
"completion_length": 1429.280532836914,
"epoch": 0.7984445884640311,
"grad_norm": 0.18304725042811948,
"kl": 0.01262664794921875,
"learning_rate": 2.0557199343631494e-07,
"loss": 0.0087,
"reward": 1.2729185968637466,
"reward_std": 0.37279824167490005,
"rewards/accuracy_reward": 0.8061224520206451,
"rewards/improved_len_reward_dast": 0.4667961820960045,
"step": 308
},
{
"completion_length": 1876.0458679199219,
"epoch": 0.8010369410239793,
"grad_norm": 0.20278131778947003,
"kl": 0.01853179931640625,
"learning_rate": 2.0295671358442033e-07,
"loss": 0.019,
"reward": 1.3648760467767715,
"reward_std": 0.3640540838241577,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.5536516159772873,
"step": 309
},
{
"completion_length": 1463.239730834961,
"epoch": 0.8036292935839274,
"grad_norm": 0.22793846718497435,
"kl": 0.014312744140625,
"learning_rate": 2.0037004441460263e-07,
"loss": 0.0287,
"reward": 1.3905141055583954,
"reward_std": 0.41797252371907234,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.5486774370074272,
"step": 310
},
{
"completion_length": 1581.4999542236328,
"epoch": 0.8062216461438756,
"grad_norm": 0.2080094216762287,
"kl": 0.01576995849609375,
"learning_rate": 1.9781219917509987e-07,
"loss": 0.0138,
"reward": 1.4025911092758179,
"reward_std": 0.3261520601809025,
"rewards/accuracy_reward": 0.8265306055545807,
"rewards/improved_len_reward_dast": 0.5760605186223984,
"step": 311
},
{
"completion_length": 1737.1019897460938,
"epoch": 0.8088139987038238,
"grad_norm": 0.22193491426249878,
"kl": 0.0164794921875,
"learning_rate": 1.9528338873786882e-07,
"loss": 0.0217,
"reward": 1.1316132843494415,
"reward_std": 0.44266829639673233,
"rewards/accuracy_reward": 0.7397959157824516,
"rewards/improved_len_reward_dast": 0.39181735552847385,
"step": 312
},
{
"completion_length": 1681.6224060058594,
"epoch": 0.8114063512637719,
"grad_norm": 0.21692033379747663,
"kl": 0.0162506103515625,
"learning_rate": 1.9278382158120116e-07,
"loss": 0.0256,
"reward": 1.2757752537727356,
"reward_std": 0.447167094796896,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.4798569083213806,
"step": 313
},
{
"completion_length": 1513.8316040039062,
"epoch": 0.81399870382372,
"grad_norm": 0.18130741669805844,
"kl": 0.01153564453125,
"learning_rate": 1.9031370377253574e-07,
"loss": 0.0246,
"reward": 1.535945862531662,
"reward_std": 0.31188252195715904,
"rewards/accuracy_reward": 0.8826530426740646,
"rewards/improved_len_reward_dast": 0.653292790055275,
"step": 314
},
{
"completion_length": 1734.6632385253906,
"epoch": 0.8165910563836681,
"grad_norm": 0.18939277983218827,
"kl": 0.0179443359375,
"learning_rate": 1.8787323895147052e-07,
"loss": -0.001,
"reward": 1.1586688458919525,
"reward_std": 0.4217538684606552,
"rewards/accuracy_reward": 0.7551020234823227,
"rewards/improved_len_reward_dast": 0.4035668522119522,
"step": 315
},
{
"completion_length": 1650.4846496582031,
"epoch": 0.8191834089436163,
"grad_norm": 0.2171448495391751,
"kl": 0.0167999267578125,
"learning_rate": 1.8546262831297438e-07,
"loss": -0.0121,
"reward": 1.464043915271759,
"reward_std": 0.3952450007200241,
"rewards/accuracy_reward": 0.8724489510059357,
"rewards/improved_len_reward_dast": 0.5915949791669846,
"step": 316
},
{
"completion_length": 1495.3316040039062,
"epoch": 0.8217757615035645,
"grad_norm": 0.19836205451789388,
"kl": 0.0137481689453125,
"learning_rate": 1.8308207059079938e-07,
"loss": -0.0069,
"reward": 1.1547789573669434,
"reward_std": 0.41507500410079956,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.3843708522617817,
"step": 317
},
{
"completion_length": 1517.8367004394531,
"epoch": 0.8243681140635126,
"grad_norm": 0.20600261332668526,
"kl": 0.0160064697265625,
"learning_rate": 1.8073176204109837e-07,
"loss": 0.0437,
"reward": 1.438821941614151,
"reward_std": 0.306551206856966,
"rewards/accuracy_reward": 0.8775510042905807,
"rewards/improved_len_reward_dast": 0.5612709149718285,
"step": 318
},
{
"completion_length": 1504.4285278320312,
"epoch": 0.8269604666234608,
"grad_norm": 0.21261278084781152,
"kl": 0.014495849609375,
"learning_rate": 1.7841189642624428e-07,
"loss": 0.0231,
"reward": 1.229389488697052,
"reward_std": 0.4350128807127476,
"rewards/accuracy_reward": 0.7959183603525162,
"rewards/improved_len_reward_dast": 0.4334711404517293,
"step": 319
},
{
"completion_length": 1672.8316040039062,
"epoch": 0.829552819183409,
"grad_norm": 0.1943882700904058,
"kl": 0.0173492431640625,
"learning_rate": 1.7612266499885642e-07,
"loss": 0.0464,
"reward": 1.5176236629486084,
"reward_std": 0.3366955704987049,
"rewards/accuracy_reward": 0.8877550810575485,
"rewards/improved_len_reward_dast": 0.6298686116933823,
"step": 320
},
{
"completion_length": 1179.0713653564453,
"epoch": 0.8321451717433571,
"grad_norm": 0.22615060777330476,
"kl": 0.012054443359375,
"learning_rate": 1.7386425648603354e-07,
"loss": 0.0423,
"reward": 1.5581437051296234,
"reward_std": 0.234028534963727,
"rewards/accuracy_reward": 0.8979591578245163,
"rewards/improved_len_reward_dast": 0.6601845473051071,
"step": 321
},
{
"completion_length": 1385.7346649169922,
"epoch": 0.8347375243033053,
"grad_norm": 0.18647668905538498,
"kl": 0.0132293701171875,
"learning_rate": 1.716368570737946e-07,
"loss": -0.0176,
"reward": 1.5387031435966492,
"reward_std": 0.39274929463863373,
"rewards/accuracy_reward": 0.9081632643938065,
"rewards/improved_len_reward_dast": 0.6305398866534233,
"step": 322
},
{
"completion_length": 1955.0357055664062,
"epoch": 0.8373298768632534,
"grad_norm": 0.1871384863519405,
"kl": 0.01862335205078125,
"learning_rate": 1.6944065039173004e-07,
"loss": 0.0282,
"reward": 0.9992491155862808,
"reward_std": 0.4749828167259693,
"rewards/accuracy_reward": 0.6785714030265808,
"rewards/improved_len_reward_dast": 0.3206777200102806,
"step": 323
},
{
"completion_length": 1949.9693298339844,
"epoch": 0.8399222294232016,
"grad_norm": 0.20078422959231634,
"kl": 0.020111083984375,
"learning_rate": 1.672758174978622e-07,
"loss": 0.0315,
"reward": 1.227005422115326,
"reward_std": 0.36194342374801636,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.46680130809545517,
"step": 324
},
{
"completion_length": 1403.64794921875,
"epoch": 0.8425145819831497,
"grad_norm": 0.20565437549884577,
"kl": 0.0128936767578125,
"learning_rate": 1.6514253686371917e-07,
"loss": 0.0204,
"reward": 1.4708826392889023,
"reward_std": 0.2500988617539406,
"rewards/accuracy_reward": 0.8826530426740646,
"rewards/improved_len_reward_dast": 0.5882296115159988,
"step": 325
},
{
"completion_length": 1667.8264770507812,
"epoch": 0.8451069345430978,
"grad_norm": 0.21813136540877595,
"kl": 0.0157318115234375,
"learning_rate": 1.630409843596216e-07,
"loss": 0.0307,
"reward": 1.3411798775196075,
"reward_std": 0.32134104520082474,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.53505739569664,
"step": 326
},
{
"completion_length": 1616.2908020019531,
"epoch": 0.847699287103046,
"grad_norm": 0.1969183257495155,
"kl": 0.0156402587890625,
"learning_rate": 1.609713332401831e-07,
"loss": 0.0085,
"reward": 1.2519380450248718,
"reward_std": 0.458795890212059,
"rewards/accuracy_reward": 0.7806122452020645,
"rewards/improved_len_reward_dast": 0.4713258519768715,
"step": 327
},
{
"completion_length": 1625.6377258300781,
"epoch": 0.8502916396629941,
"grad_norm": 0.24417535965250406,
"kl": 0.0139617919921875,
"learning_rate": 1.5893375413002765e-07,
"loss": -0.0317,
"reward": 1.2513196468353271,
"reward_std": 0.47703811526298523,
"rewards/accuracy_reward": 0.7704081386327744,
"rewards/improved_len_reward_dast": 0.4809115380048752,
"step": 328
},
{
"completion_length": 2058.948944091797,
"epoch": 0.8528839922229423,
"grad_norm": 0.19451912015501954,
"kl": 0.0210418701171875,
"learning_rate": 1.569284150097226e-07,
"loss": 0.0377,
"reward": 1.2445521801710129,
"reward_std": 0.26459160074591637,
"rewards/accuracy_reward": 0.7295918315649033,
"rewards/improved_len_reward_dast": 0.5149602852761745,
"step": 329
},
{
"completion_length": 1789.7040405273438,
"epoch": 0.8554763447828905,
"grad_norm": 0.24266903278771249,
"kl": 0.019378662109375,
"learning_rate": 1.5495548120193003e-07,
"loss": 0.0434,
"reward": 1.322462946176529,
"reward_std": 0.38080430775880814,
"rewards/accuracy_reward": 0.8265305906534195,
"rewards/improved_len_reward_dast": 0.49593234062194824,
"step": 330
},
{
"completion_length": 1468.8213653564453,
"epoch": 0.8580686973428386,
"grad_norm": 0.1945755306885796,
"kl": 0.01294708251953125,
"learning_rate": 1.5301511535777784e-07,
"loss": 0.0302,
"reward": 1.5070666372776031,
"reward_std": 0.3562978059053421,
"rewards/accuracy_reward": 0.8724489510059357,
"rewards/improved_len_reward_dast": 0.6346177160739899,
"step": 331
},
{
"completion_length": 1581.3825988769531,
"epoch": 0.8606610499027868,
"grad_norm": 0.29272858693831433,
"kl": 0.01812744140625,
"learning_rate": 1.5110747744345006e-07,
"loss": 0.0122,
"reward": 1.3418152332305908,
"reward_std": 0.4640466570854187,
"rewards/accuracy_reward": 0.8724489659070969,
"rewards/improved_len_reward_dast": 0.46936625242233276,
"step": 332
},
{
"completion_length": 1786.1734313964844,
"epoch": 0.863253402462735,
"grad_norm": 0.19480551857525122,
"kl": 0.019775390625,
"learning_rate": 1.4923272472699986e-07,
"loss": -0.0042,
"reward": 1.1590133309364319,
"reward_std": 0.2618263028562069,
"rewards/accuracy_reward": 0.7193877398967743,
"rewards/improved_len_reward_dast": 0.4396255351603031,
"step": 333
},
{
"completion_length": 1171.147933959961,
"epoch": 0.8658457550226831,
"grad_norm": 0.23814232802014945,
"kl": 0.013671875,
"learning_rate": 1.4739101176538274e-07,
"loss": 0.0174,
"reward": 1.2705652117729187,
"reward_std": 0.3895917683839798,
"rewards/accuracy_reward": 0.8367346823215485,
"rewards/improved_len_reward_dast": 0.43383053690195084,
"step": 334
},
{
"completion_length": 1758.0816040039062,
"epoch": 0.8684381075826313,
"grad_norm": 0.22764969968005389,
"kl": 0.0219268798828125,
"learning_rate": 1.4558249039171639e-07,
"loss": 0.0414,
"reward": 1.358829528093338,
"reward_std": 0.38345643877983093,
"rewards/accuracy_reward": 0.8367346823215485,
"rewards/improved_len_reward_dast": 0.5220948457717896,
"step": 335
},
{
"completion_length": 1889.0509948730469,
"epoch": 0.8710304601425793,
"grad_norm": 0.22895792507657853,
"kl": 0.021484375,
"learning_rate": 1.4380730970276195e-07,
"loss": 0.0354,
"reward": 1.07760888338089,
"reward_std": 0.3665538318455219,
"rewards/accuracy_reward": 0.6887754797935486,
"rewards/improved_len_reward_dast": 0.3888333588838577,
"step": 336
},
{
"completion_length": 2373.249969482422,
"epoch": 0.8736228127025275,
"grad_norm": 0.2697468121522664,
"kl": 0.026397705078125,
"learning_rate": 1.420656160466333e-07,
"loss": -0.0102,
"reward": 1.0278730392456055,
"reward_std": 0.348503515124321,
"rewards/accuracy_reward": 0.6938775330781937,
"rewards/improved_len_reward_dast": 0.33399548195302486,
"step": 337
},
{
"completion_length": 1981.8978881835938,
"epoch": 0.8762151652624757,
"grad_norm": 0.20587316419649823,
"kl": 0.0223846435546875,
"learning_rate": 1.4035755301073102e-07,
"loss": 0.0273,
"reward": 1.2939772605895996,
"reward_std": 0.46924955397844315,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.5286711901426315,
"step": 338
},
{
"completion_length": 1536.4336395263672,
"epoch": 0.8788075178224238,
"grad_norm": 0.20611627730954438,
"kl": 0.0202789306640625,
"learning_rate": 1.386832614099056e-07,
"loss": 0.006,
"reward": 1.4531451165676117,
"reward_std": 0.3475269414484501,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.5960022807121277,
"step": 339
},
{
"completion_length": 1489.7652435302734,
"epoch": 0.881399870382372,
"grad_norm": 0.2223037836334228,
"kl": 0.0159454345703125,
"learning_rate": 1.3704287927484846e-07,
"loss": -0.0138,
"reward": 1.3403507471084595,
"reward_std": 0.46086446195840836,
"rewards/accuracy_reward": 0.8112244606018066,
"rewards/improved_len_reward_dast": 0.529126301407814,
"step": 340
},
{
"completion_length": 1788.7091674804688,
"epoch": 0.8839922229423202,
"grad_norm": 0.188880858513302,
"kl": 0.0198516845703125,
"learning_rate": 1.3543654184071186e-07,
"loss": 0.0144,
"reward": 1.320367306470871,
"reward_std": 0.2726456895470619,
"rewards/accuracy_reward": 0.7755101919174194,
"rewards/improved_len_reward_dast": 0.5448571220040321,
"step": 341
},
{
"completion_length": 1541.3316192626953,
"epoch": 0.8865845755022683,
"grad_norm": 0.20649364949795315,
"kl": 0.01570892333984375,
"learning_rate": 1.3386438153596067e-07,
"loss": 0.0104,
"reward": 1.327652782201767,
"reward_std": 0.3968999646604061,
"rewards/accuracy_reward": 0.846938744187355,
"rewards/improved_len_reward_dast": 0.4807140678167343,
"step": 342
},
{
"completion_length": 1504.8775329589844,
"epoch": 0.8891769280622165,
"grad_norm": 0.23748978746970162,
"kl": 0.0181427001953125,
"learning_rate": 1.323265279714543e-07,
"loss": -0.0172,
"reward": 1.3229451477527618,
"reward_std": 0.38034195080399513,
"rewards/accuracy_reward": 0.8265306055545807,
"rewards/improved_len_reward_dast": 0.49641457200050354,
"step": 343
},
{
"completion_length": 1616.14794921875,
"epoch": 0.8917692806221647,
"grad_norm": 0.228900632017236,
"kl": 0.020263671875,
"learning_rate": 1.3082310792976202e-07,
"loss": 0.0331,
"reward": 1.4383951127529144,
"reward_std": 0.32518207281827927,
"rewards/accuracy_reward": 0.8520407974720001,
"rewards/improved_len_reward_dast": 0.5863542854785919,
"step": 344
},
{
"completion_length": 1765.0509948730469,
"epoch": 0.8943616331821128,
"grad_norm": 0.21689615981919957,
"kl": 0.0205841064453125,
"learning_rate": 1.293542453547102e-07,
"loss": 0.0219,
"reward": 1.3277872800827026,
"reward_std": 0.4930282086133957,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.5114607587456703,
"step": 345
},
{
"completion_length": 1576.6071166992188,
"epoch": 0.8969539857420609,
"grad_norm": 0.2503011086919002,
"kl": 0.0197906494140625,
"learning_rate": 1.279200613411642e-07,
"loss": 0.044,
"reward": 1.2905025482177734,
"reward_std": 0.47432298958301544,
"rewards/accuracy_reward": 0.8214285522699356,
"rewards/improved_len_reward_dast": 0.46907395869493484,
"step": 346
},
{
"completion_length": 2153.3162231445312,
"epoch": 0.899546338302009,
"grad_norm": 0.23273243697852358,
"kl": 0.023712158203125,
"learning_rate": 1.2652067412504605e-07,
"loss": 0.0312,
"reward": 1.047543928027153,
"reward_std": 0.3953222408890724,
"rewards/accuracy_reward": 0.688775509595871,
"rewards/improved_len_reward_dast": 0.35876838117837906,
"step": 347
},
{
"completion_length": 1542.3111877441406,
"epoch": 0.9021386908619572,
"grad_norm": 0.25879665856811085,
"kl": 0.0159149169921875,
"learning_rate": 1.251561990735859e-07,
"loss": 0.0306,
"reward": 1.4665509164333344,
"reward_std": 0.34583452716469765,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.599203959107399,
"step": 348
},
{
"completion_length": 2166.5713806152344,
"epoch": 0.9047310434219054,
"grad_norm": 0.21742881103681694,
"kl": 0.029144287109375,
"learning_rate": 1.238267486758117e-07,
"loss": 0.0221,
"reward": 0.9765184819698334,
"reward_std": 0.4072360023856163,
"rewards/accuracy_reward": 0.6224489733576775,
"rewards/improved_len_reward_dast": 0.3540695160627365,
"step": 349
},
{
"completion_length": 1897.44384765625,
"epoch": 0.9073233959818535,
"grad_norm": 0.20381019828760852,
"kl": 0.022857666015625,
"learning_rate": 1.2253243253327504e-07,
"loss": 0.0392,
"reward": 1.2360577583312988,
"reward_std": 0.4647463858127594,
"rewards/accuracy_reward": 0.7653061151504517,
"rewards/improved_len_reward_dast": 0.470751591026783,
"step": 350
},
{
"completion_length": 1563.9234313964844,
"epoch": 0.9099157485418017,
"grad_norm": 0.2149667100915999,
"kl": 0.01705169677734375,
"learning_rate": 1.212733573510154e-07,
"loss": 0.0251,
"reward": 1.484131395816803,
"reward_std": 0.3115840032696724,
"rewards/accuracy_reward": 0.867346927523613,
"rewards/improved_len_reward_dast": 0.6167844533920288,
"step": 351
},
{
"completion_length": 1613.438720703125,
"epoch": 0.9125081011017498,
"grad_norm": 0.2397808119710266,
"kl": 0.01849365234375,
"learning_rate": 1.20049626928764e-07,
"loss": 0.0255,
"reward": 1.374268501996994,
"reward_std": 0.3617161624133587,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.5579419583082199,
"step": 352
},
{
"completion_length": 1810.8724060058594,
"epoch": 0.915100453661698,
"grad_norm": 0.1952032672447838,
"kl": 0.0240478515625,
"learning_rate": 1.1886134215238539e-07,
"loss": 0.0013,
"reward": 1.2345272898674011,
"reward_std": 0.4293368086218834,
"rewards/accuracy_reward": 0.7602040618658066,
"rewards/improved_len_reward_dast": 0.47432321310043335,
"step": 353
},
{
"completion_length": 1323.6071319580078,
"epoch": 0.9176928062216462,
"grad_norm": 0.23544630425662993,
"kl": 0.0150299072265625,
"learning_rate": 1.1770860098556122e-07,
"loss": -0.0126,
"reward": 1.5638253688812256,
"reward_std": 0.3317151963710785,
"rewards/accuracy_reward": 0.9234693795442581,
"rewards/improved_len_reward_dast": 0.6403559893369675,
"step": 354
},
{
"completion_length": 1648.1122436523438,
"epoch": 0.9202851587815943,
"grad_norm": 0.19373617697957926,
"kl": 0.01983642578125,
"learning_rate": 1.1659149846171314e-07,
"loss": -0.0106,
"reward": 1.409626692533493,
"reward_std": 0.3634777031838894,
"rewards/accuracy_reward": 0.8112244606018066,
"rewards/improved_len_reward_dast": 0.5984021797776222,
"step": 355
},
{
"completion_length": 1640.484634399414,
"epoch": 0.9228775113415425,
"grad_norm": 0.2139648005259324,
"kl": 0.02065277099609375,
"learning_rate": 1.1551012667616889e-07,
"loss": -0.0041,
"reward": 1.3790205717086792,
"reward_std": 0.3004123643040657,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.5780001431703568,
"step": 356
},
{
"completion_length": 1952.6427612304688,
"epoch": 0.9254698639014906,
"grad_norm": 0.20207361431898127,
"kl": 0.027069091796875,
"learning_rate": 1.1446457477856933e-07,
"loss": 0.0274,
"reward": 1.1954913437366486,
"reward_std": 0.30133310705423355,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.450593464076519,
"step": 357
},
{
"completion_length": 1666.0816040039062,
"epoch": 0.9280622164614387,
"grad_norm": 0.2020263485504787,
"kl": 0.0185546875,
"learning_rate": 1.1345492896551908e-07,
"loss": -0.0157,
"reward": 1.4352505505084991,
"reward_std": 0.4688113033771515,
"rewards/accuracy_reward": 0.8928571343421936,
"rewards/improved_len_reward_dast": 0.542393408715725,
"step": 358
},
{
"completion_length": 1809.0611877441406,
"epoch": 0.9306545690213869,
"grad_norm": 0.2096938589768357,
"kl": 0.020904541015625,
"learning_rate": 1.1248127247348025e-07,
"loss": 0.0384,
"reward": 1.3605789840221405,
"reward_std": 0.35709768906235695,
"rewards/accuracy_reward": 0.8163264989852905,
"rewards/improved_len_reward_dast": 0.544252522289753,
"step": 359
},
{
"completion_length": 1797.744857788086,
"epoch": 0.933246921581335,
"grad_norm": 0.21622133027589538,
"kl": 0.02146148681640625,
"learning_rate": 1.1154368557191032e-07,
"loss": 0.0154,
"reward": 1.0935336202383041,
"reward_std": 0.3505462594330311,
"rewards/accuracy_reward": 0.6938775479793549,
"rewards/improved_len_reward_dast": 0.3996560573577881,
"step": 360
},
{
"completion_length": 1433.0765075683594,
"epoch": 0.9358392741412832,
"grad_norm": 0.22187489868295793,
"kl": 0.0160064697265625,
"learning_rate": 1.1064224555664489e-07,
"loss": -0.0178,
"reward": 1.2581793367862701,
"reward_std": 0.4055371508002281,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.4520568624138832,
"step": 361
},
{
"completion_length": 1678.2703857421875,
"epoch": 0.9384316267012314,
"grad_norm": 0.18769832722230134,
"kl": 0.0196075439453125,
"learning_rate": 1.0977702674352485e-07,
"loss": 0.0061,
"reward": 1.533081442117691,
"reward_std": 0.24393456988036633,
"rewards/accuracy_reward": 0.8673469126224518,
"rewards/improved_len_reward_dast": 0.6657344847917557,
"step": 362
},
{
"completion_length": 1496.3112030029297,
"epoch": 0.9410239792611795,
"grad_norm": 0.2409591218430649,
"kl": 0.01830291748046875,
"learning_rate": 1.0894810046227007e-07,
"loss": 0.0454,
"reward": 1.3800479769706726,
"reward_std": 0.3536526523530483,
"rewards/accuracy_reward": 0.8316326439380646,
"rewards/improved_len_reward_dast": 0.548415370285511,
"step": 363
},
{
"completion_length": 1296.9234313964844,
"epoch": 0.9436163318211277,
"grad_norm": 0.2065960957661233,
"kl": 0.014404296875,
"learning_rate": 1.0815553505059864e-07,
"loss": 0.0346,
"reward": 1.4174171388149261,
"reward_std": 0.3700226917862892,
"rewards/accuracy_reward": 0.8673469126224518,
"rewards/improved_len_reward_dast": 0.5500702187418938,
"step": 364
},
{
"completion_length": 1770.8111572265625,
"epoch": 0.9462086843810759,
"grad_norm": 0.22025176867987864,
"kl": 0.0205535888671875,
"learning_rate": 1.0739939584859327e-07,
"loss": 0.0372,
"reward": 1.2784855961799622,
"reward_std": 0.40080468729138374,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.4876692369580269,
"step": 365
},
{
"completion_length": 2252.9540405273438,
"epoch": 0.948801036941024,
"grad_norm": 0.25202994466231426,
"kl": 0.028900146484375,
"learning_rate": 1.066797451933144e-07,
"loss": 0.0538,
"reward": 1.052029862999916,
"reward_std": 0.4297824278473854,
"rewards/accuracy_reward": 0.6734693944454193,
"rewards/improved_len_reward_dast": 0.37856047973036766,
"step": 366
},
{
"completion_length": 1675.0867309570312,
"epoch": 0.9513933895009722,
"grad_norm": 0.18981437618840255,
"kl": 0.019775390625,
"learning_rate": 1.0599664241366108e-07,
"loss": 0.0215,
"reward": 1.4016070365905762,
"reward_std": 0.4491507261991501,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.5444641783833504,
"step": 367
},
{
"completion_length": 2051.3162536621094,
"epoch": 0.9539857420609202,
"grad_norm": 0.18988751309956323,
"kl": 0.0218658447265625,
"learning_rate": 1.0535014382547976e-07,
"loss": -0.0024,
"reward": 1.3321772515773773,
"reward_std": 0.5532524138689041,
"rewards/accuracy_reward": 0.8418367207050323,
"rewards/improved_len_reward_dast": 0.4903404861688614,
"step": 368
},
{
"completion_length": 1725.3927917480469,
"epoch": 0.9565780946208684,
"grad_norm": 0.26332331622328803,
"kl": 0.02056884765625,
"learning_rate": 1.0474030272692176e-07,
"loss": -0.0428,
"reward": 1.1207705438137054,
"reward_std": 0.582356795668602,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.33505629003047943,
"step": 369
},
{
"completion_length": 1730.3264465332031,
"epoch": 0.9591704471808166,
"grad_norm": 0.23147600575876767,
"kl": 0.020355224609375,
"learning_rate": 1.0416716939404906e-07,
"loss": 0.0207,
"reward": 1.4236516058444977,
"reward_std": 0.4436470791697502,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.5665087997913361,
"step": 370
},
{
"completion_length": 2078.234649658203,
"epoch": 0.9617627997407647,
"grad_norm": 0.18318392619509644,
"kl": 0.02490234375,
"learning_rate": 1.0363079107668965e-07,
"loss": 0.0174,
"reward": 1.2476365268230438,
"reward_std": 0.4425313174724579,
"rewards/accuracy_reward": 0.7704081535339355,
"rewards/improved_len_reward_dast": 0.4772283583879471,
"step": 371
},
{
"completion_length": 1901.7754821777344,
"epoch": 0.9643551523007129,
"grad_norm": 0.2045058157665467,
"kl": 0.0230865478515625,
"learning_rate": 1.03131211994542e-07,
"loss": 0.0151,
"reward": 1.1136702597141266,
"reward_std": 0.4208161160349846,
"rewards/accuracy_reward": 0.6989795863628387,
"rewards/improved_len_reward_dast": 0.41469068080186844,
"step": 372
},
{
"completion_length": 1673.6377563476562,
"epoch": 0.9669475048606611,
"grad_norm": 0.1953573582384899,
"kl": 0.0203399658203125,
"learning_rate": 1.0266847333352986e-07,
"loss": 0.0144,
"reward": 1.2215417325496674,
"reward_std": 0.3687748461961746,
"rewards/accuracy_reward": 0.8061224222183228,
"rewards/improved_len_reward_dast": 0.4154192693531513,
"step": 373
},
{
"completion_length": 1465.4744262695312,
"epoch": 0.9695398574206092,
"grad_norm": 0.2392315039852379,
"kl": 0.020263671875,
"learning_rate": 1.022426132424064e-07,
"loss": 0.0264,
"reward": 1.3526732623577118,
"reward_std": 0.2864141073077917,
"rewards/accuracy_reward": 0.8418367058038712,
"rewards/improved_len_reward_dast": 0.5108365193009377,
"step": 374
},
{
"completion_length": 1698.5611877441406,
"epoch": 0.9721322099805574,
"grad_norm": 0.22243506530923526,
"kl": 0.018157958984375,
"learning_rate": 1.0185366682960968e-07,
"loss": 0.0368,
"reward": 1.2421083450317383,
"reward_std": 0.3934044614434242,
"rewards/accuracy_reward": 0.7908163070678711,
"rewards/improved_len_reward_dast": 0.451292023062706,
"step": 375
},
{
"completion_length": 1694.5101623535156,
"epoch": 0.9747245625405055,
"grad_norm": 0.2049483563870167,
"kl": 0.02301025390625,
"learning_rate": 1.015016661603677e-07,
"loss": 0.0109,
"reward": 1.2675099819898605,
"reward_std": 0.27898336201906204,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.4613875336945057,
"step": 376
},
{
"completion_length": 1818.9183349609375,
"epoch": 0.9773169151004537,
"grad_norm": 0.2917301156280802,
"kl": 0.022247314453125,
"learning_rate": 1.011866402540555e-07,
"loss": 0.052,
"reward": 1.2979410141706467,
"reward_std": 0.4051199574023485,
"rewards/accuracy_reward": 0.8010203987360001,
"rewards/improved_len_reward_dast": 0.4969206303358078,
"step": 377
},
{
"completion_length": 1676.4030151367188,
"epoch": 0.9799092676604018,
"grad_norm": 0.19999847167358073,
"kl": 0.0189666748046875,
"learning_rate": 1.0090861508180229e-07,
"loss": 0.0173,
"reward": 1.307900682091713,
"reward_std": 0.36051470041275024,
"rewards/accuracy_reward": 0.806122437119484,
"rewards/improved_len_reward_dast": 0.5017782524228096,
"step": 378
},
{
"completion_length": 1303.3468933105469,
"epoch": 0.9825016202203499,
"grad_norm": 0.23002851272315084,
"kl": 0.016387939453125,
"learning_rate": 1.006676135643506e-07,
"loss": 0.0223,
"reward": 1.5040651261806488,
"reward_std": 0.28981203213334084,
"rewards/accuracy_reward": 0.8877550810575485,
"rewards/improved_len_reward_dast": 0.6163100153207779,
"step": 379
},
{
"completion_length": 1699.98974609375,
"epoch": 0.9850939727802981,
"grad_norm": 0.2773167363062717,
"kl": 0.021759033203125,
"learning_rate": 1.004636555701666e-07,
"loss": -0.0024,
"reward": 1.3300544768571854,
"reward_std": 0.4332263544201851,
"rewards/accuracy_reward": 0.857142835855484,
"rewards/improved_len_reward_dast": 0.47291168570518494,
"step": 380
},
{
"completion_length": 2158.5560607910156,
"epoch": 0.9876863253402463,
"grad_norm": 0.19893298725270195,
"kl": 0.027099609375,
"learning_rate": 1.0029675791380211e-07,
"loss": 0.0245,
"reward": 1.366698831319809,
"reward_std": 0.3425176590681076,
"rewards/accuracy_reward": 0.8112244755029678,
"rewards/improved_len_reward_dast": 0.5554743856191635,
"step": 381
},
{
"completion_length": 1771.0765075683594,
"epoch": 0.9902786779001944,
"grad_norm": 0.21454331685840108,
"kl": 0.025909423828125,
"learning_rate": 1.0016693435450846e-07,
"loss": 0.0522,
"reward": 1.1434401869773865,
"reward_std": 0.518133670091629,
"rewards/accuracy_reward": 0.7448979467153549,
"rewards/improved_len_reward_dast": 0.39854224771261215,
"step": 382
},
{
"completion_length": 1916.8673095703125,
"epoch": 0.9928710304601426,
"grad_norm": 0.21868762838968606,
"kl": 0.0216217041015625,
"learning_rate": 1.00074195595102e-07,
"loss": 0.0149,
"reward": 1.2855271100997925,
"reward_std": 0.4449741840362549,
"rewards/accuracy_reward": 0.7857142686843872,
"rewards/improved_len_reward_dast": 0.4998128265142441,
"step": 383
},
{
"completion_length": 1359.0254821777344,
"epoch": 0.9954633830200907,
"grad_norm": 0.22146763439588837,
"kl": 0.01685333251953125,
"learning_rate": 1.0001854928108199e-07,
"loss": -0.0267,
"reward": 1.3678375780582428,
"reward_std": 0.3422878012061119,
"rewards/accuracy_reward": 0.8214285671710968,
"rewards/improved_len_reward_dast": 0.5464089959859848,
"step": 384
},
{
"completion_length": 1564.7193908691406,
"epoch": 0.9980557355800389,
"grad_norm": 0.29725903676415294,
"kl": 0.019683837890625,
"learning_rate": 1e-07,
"loss": 0.0597,
"reward": 1.2890927195549011,
"reward_std": 0.3781392499804497,
"rewards/accuracy_reward": 0.795918345451355,
"rewards/improved_len_reward_dast": 0.49317440390586853,
"step": 385
},
{
"epoch": 0.9980557355800389,
"step": 385,
"total_flos": 0.0,
"train_loss": 0.0015093988140246698,
"train_runtime": 5817.5821,
"train_samples_per_second": 1.856,
"train_steps_per_second": 0.066
}
],
"logging_steps": 1,
"max_steps": 385,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}