Kadins's picture
Model save
e9cf7ae verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9980557355800389,
"eval_steps": 500,
"global_step": 385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1870.551025390625,
"epoch": 0.002592352559948153,
"grad_norm": 0.13448497653007507,
"kl": 0.0,
"learning_rate": 2.564102564102564e-08,
"loss": 0.022,
"reward": 0.5185521692037582,
"reward_std": 0.3244118466973305,
"rewards/improved_len_reward_dast": 0.5185521692037582,
"step": 1
},
{
"completion_length": 2074.5535583496094,
"epoch": 0.005184705119896306,
"grad_norm": 0.11098560690879822,
"kl": 0.0,
"learning_rate": 5.128205128205128e-08,
"loss": 0.0251,
"reward": 0.32963134348392487,
"reward_std": 0.28946176916360855,
"rewards/improved_len_reward_dast": 0.32963134348392487,
"step": 2
},
{
"completion_length": 1990.8341674804688,
"epoch": 0.007777057679844459,
"grad_norm": 0.1158740445971489,
"kl": 0.00014102458953857422,
"learning_rate": 7.692307692307692e-08,
"loss": -0.0241,
"reward": 0.26494530215859413,
"reward_std": 0.3063320405781269,
"rewards/improved_len_reward_dast": 0.26494530215859413,
"step": 3
},
{
"completion_length": 2150.165802001953,
"epoch": 0.010369410239792612,
"grad_norm": 0.10702688992023468,
"kl": 0.00012409687042236328,
"learning_rate": 1.0256410256410256e-07,
"loss": -0.0137,
"reward": 0.456451453268528,
"reward_std": 0.3154432289302349,
"rewards/improved_len_reward_dast": 0.456451453268528,
"step": 4
},
{
"completion_length": 1848.1632385253906,
"epoch": 0.012961762799740765,
"grad_norm": 0.12357146292924881,
"kl": 0.0001302957534790039,
"learning_rate": 1.2820512820512818e-07,
"loss": 0.028,
"reward": 0.49856673181056976,
"reward_std": 0.2522367388010025,
"rewards/improved_len_reward_dast": 0.49856673181056976,
"step": 5
},
{
"completion_length": 2126.336700439453,
"epoch": 0.015554115359688918,
"grad_norm": 0.13506007194519043,
"kl": 0.00012934207916259766,
"learning_rate": 1.5384615384615385e-07,
"loss": 0.0393,
"reward": 0.4235878065228462,
"reward_std": 0.25951137393713,
"rewards/improved_len_reward_dast": 0.4235878065228462,
"step": 6
},
{
"completion_length": 1832.3622131347656,
"epoch": 0.01814646791963707,
"grad_norm": 0.1261880099773407,
"kl": 0.00011241436004638672,
"learning_rate": 1.7948717948717948e-07,
"loss": 0.0262,
"reward": 0.3710284195840359,
"reward_std": 0.2790074981749058,
"rewards/improved_len_reward_dast": 0.3710284195840359,
"step": 7
},
{
"completion_length": 1735.2193603515625,
"epoch": 0.020738820479585224,
"grad_norm": 0.12955217063426971,
"kl": 0.00010597705841064453,
"learning_rate": 2.0512820512820512e-07,
"loss": 0.0141,
"reward": 0.4706665948033333,
"reward_std": 0.2832951880991459,
"rewards/improved_len_reward_dast": 0.4706665948033333,
"step": 8
},
{
"completion_length": 2113.0381774902344,
"epoch": 0.023331173039533377,
"grad_norm": 0.12865294516086578,
"kl": 0.0001271963119506836,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.0338,
"reward": 0.35827554017305374,
"reward_std": 0.29027409106492996,
"rewards/improved_len_reward_dast": 0.35827554017305374,
"step": 9
},
{
"completion_length": 1956.9030456542969,
"epoch": 0.02592352559948153,
"grad_norm": 0.13909928500652313,
"kl": 0.00012624263763427734,
"learning_rate": 2.5641025641025636e-07,
"loss": 0.0336,
"reward": 0.3675283007323742,
"reward_std": 0.2691008448600769,
"rewards/improved_len_reward_dast": 0.3675283007323742,
"step": 10
},
{
"completion_length": 2262.3163146972656,
"epoch": 0.028515878159429683,
"grad_norm": 0.12856782972812653,
"kl": 0.0001437664031982422,
"learning_rate": 2.8205128205128203e-07,
"loss": 0.017,
"reward": 0.31318413466215134,
"reward_std": 0.3111809715628624,
"rewards/improved_len_reward_dast": 0.31318413466215134,
"step": 11
},
{
"completion_length": 2018.7933349609375,
"epoch": 0.031108230719377836,
"grad_norm": 0.12054255604743958,
"kl": 0.0001354217529296875,
"learning_rate": 3.076923076923077e-07,
"loss": -0.0047,
"reward": 0.3627483192831278,
"reward_std": 0.3026025593280792,
"rewards/improved_len_reward_dast": 0.3627483192831278,
"step": 12
},
{
"completion_length": 1993.1734008789062,
"epoch": 0.033700583279325985,
"grad_norm": 0.13207760453224182,
"kl": 0.00013363361358642578,
"learning_rate": 3.333333333333333e-07,
"loss": 0.01,
"reward": 0.371895145624876,
"reward_std": 0.26758549362421036,
"rewards/improved_len_reward_dast": 0.371895145624876,
"step": 13
},
{
"completion_length": 2101.9234313964844,
"epoch": 0.03629293583927414,
"grad_norm": 0.13171768188476562,
"kl": 0.00011819601058959961,
"learning_rate": 3.5897435897435896e-07,
"loss": 0.0455,
"reward": 0.39824650436639786,
"reward_std": 0.23699114099144936,
"rewards/improved_len_reward_dast": 0.39824650436639786,
"step": 14
},
{
"completion_length": 1458.2933349609375,
"epoch": 0.03888528839922229,
"grad_norm": 0.15543967485427856,
"kl": 7.2479248046875e-05,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.0843,
"reward": 0.4232407733798027,
"reward_std": 0.2356618531048298,
"rewards/improved_len_reward_dast": 0.4232407733798027,
"step": 15
},
{
"completion_length": 1313.4540405273438,
"epoch": 0.04147764095917045,
"grad_norm": 0.1376647651195526,
"kl": 9.846687316894531e-05,
"learning_rate": 4.1025641025641024e-07,
"loss": -0.0437,
"reward": 0.4939410910010338,
"reward_std": 0.3177715875208378,
"rewards/improved_len_reward_dast": 0.4939410910010338,
"step": 16
},
{
"completion_length": 2019.0994873046875,
"epoch": 0.0440699935191186,
"grad_norm": 0.11130757629871368,
"kl": 0.00011444091796875,
"learning_rate": 4.358974358974359e-07,
"loss": 0.0027,
"reward": 0.4330388903617859,
"reward_std": 0.37679746001958847,
"rewards/improved_len_reward_dast": 0.4330388903617859,
"step": 17
},
{
"completion_length": 1429.14794921875,
"epoch": 0.046662346079066754,
"grad_norm": 0.1924666464328766,
"kl": 8.881092071533203e-05,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.0804,
"reward": 0.30921216681599617,
"reward_std": 0.2913207747042179,
"rewards/improved_len_reward_dast": 0.30921216681599617,
"step": 18
},
{
"completion_length": 1738.7474060058594,
"epoch": 0.0492546986390149,
"grad_norm": 0.13304243981838226,
"kl": 0.0001150369644165039,
"learning_rate": 4.871794871794871e-07,
"loss": 0.0111,
"reward": 0.38398153707385063,
"reward_std": 0.34359200298786163,
"rewards/improved_len_reward_dast": 0.38398153707385063,
"step": 19
},
{
"completion_length": 1761.8928527832031,
"epoch": 0.05184705119896306,
"grad_norm": 0.13784411549568176,
"kl": 0.00011897087097167969,
"learning_rate": 5.128205128205127e-07,
"loss": -0.0009,
"reward": 0.31885702908039093,
"reward_std": 0.3634636849164963,
"rewards/improved_len_reward_dast": 0.31885702908039093,
"step": 20
},
{
"completion_length": 2171.124969482422,
"epoch": 0.05443940375891121,
"grad_norm": 0.13725849986076355,
"kl": 0.00013327598571777344,
"learning_rate": 5.384615384615384e-07,
"loss": 0.003,
"reward": 0.28723688423633575,
"reward_std": 0.3837554454803467,
"rewards/improved_len_reward_dast": 0.28723688423633575,
"step": 21
},
{
"completion_length": 1959.369873046875,
"epoch": 0.057031756318859365,
"grad_norm": 0.14588049054145813,
"kl": 0.00011110305786132812,
"learning_rate": 5.641025641025641e-07,
"loss": 0.0188,
"reward": 0.39568372815847397,
"reward_std": 0.3404585272073746,
"rewards/improved_len_reward_dast": 0.39568372815847397,
"step": 22
},
{
"completion_length": 1891.4412536621094,
"epoch": 0.059624108878807515,
"grad_norm": 0.11357180029153824,
"kl": 0.00010156631469726562,
"learning_rate": 5.897435897435898e-07,
"loss": 0.0197,
"reward": 0.49445799738168716,
"reward_std": 0.2365701049566269,
"rewards/improved_len_reward_dast": 0.49445799738168716,
"step": 23
},
{
"completion_length": 1616.0025482177734,
"epoch": 0.06221646143875567,
"grad_norm": 0.12517189979553223,
"kl": 0.0001264810562133789,
"learning_rate": 6.153846153846154e-07,
"loss": 0.0208,
"reward": 0.38593798875808716,
"reward_std": 0.26876550912857056,
"rewards/improved_len_reward_dast": 0.38593798875808716,
"step": 24
},
{
"completion_length": 2217.3290405273438,
"epoch": 0.06480881399870382,
"grad_norm": 0.12683141231536865,
"kl": 0.00015163421630859375,
"learning_rate": 6.410256410256411e-07,
"loss": 0.0211,
"reward": 0.35348474979400635,
"reward_std": 0.2698053792119026,
"rewards/improved_len_reward_dast": 0.35348474979400635,
"step": 25
},
{
"completion_length": 1800.8596801757812,
"epoch": 0.06740116655865197,
"grad_norm": 0.13101568818092346,
"kl": 0.00011932849884033203,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0015,
"reward": 0.40977882593870163,
"reward_std": 0.31720418483018875,
"rewards/improved_len_reward_dast": 0.40977882593870163,
"step": 26
},
{
"completion_length": 1796.7372131347656,
"epoch": 0.06999351911860013,
"grad_norm": 0.13930906355381012,
"kl": 0.0001131296157836914,
"learning_rate": 6.923076923076922e-07,
"loss": 0.0671,
"reward": 0.4471106305718422,
"reward_std": 0.2687300704419613,
"rewards/improved_len_reward_dast": 0.4471106305718422,
"step": 27
},
{
"completion_length": 1676.813720703125,
"epoch": 0.07258587167854828,
"grad_norm": 0.14517556130886078,
"kl": 0.0001157522201538086,
"learning_rate": 7.179487179487179e-07,
"loss": 0.0433,
"reward": 0.4442668706178665,
"reward_std": 0.23423199355602264,
"rewards/improved_len_reward_dast": 0.4442668706178665,
"step": 28
},
{
"completion_length": 2002.9489440917969,
"epoch": 0.07517822423849643,
"grad_norm": 0.12484736740589142,
"kl": 0.0001323223114013672,
"learning_rate": 7.435897435897435e-07,
"loss": -0.0168,
"reward": 0.4897717013955116,
"reward_std": 0.24505353346467018,
"rewards/improved_len_reward_dast": 0.4897717013955116,
"step": 29
},
{
"completion_length": 1715.0433349609375,
"epoch": 0.07777057679844458,
"grad_norm": 0.11366493999958038,
"kl": 8.857250213623047e-05,
"learning_rate": 7.692307692307693e-07,
"loss": -0.0119,
"reward": 0.24165286868810654,
"reward_std": 0.2886221148073673,
"rewards/improved_len_reward_dast": 0.24165286868810654,
"step": 30
},
{
"completion_length": 2232.938751220703,
"epoch": 0.08036292935839275,
"grad_norm": 0.10916973650455475,
"kl": 0.00017595291137695312,
"learning_rate": 7.948717948717948e-07,
"loss": 0.0336,
"reward": 0.5083014816045761,
"reward_std": 0.25778181850910187,
"rewards/improved_len_reward_dast": 0.5083014816045761,
"step": 31
},
{
"completion_length": 1905.7474060058594,
"epoch": 0.0829552819183409,
"grad_norm": 0.15653526782989502,
"kl": 0.00012862682342529297,
"learning_rate": 8.205128205128205e-07,
"loss": -0.0477,
"reward": 0.2919162670150399,
"reward_std": 0.3391455188393593,
"rewards/improved_len_reward_dast": 0.2919162670150399,
"step": 32
},
{
"completion_length": 1948.9158325195312,
"epoch": 0.08554763447828904,
"grad_norm": 0.1198260486125946,
"kl": 0.00013387203216552734,
"learning_rate": 8.461538461538461e-07,
"loss": 0.0302,
"reward": 0.4744948521256447,
"reward_std": 0.31309082731604576,
"rewards/improved_len_reward_dast": 0.4744948521256447,
"step": 33
},
{
"completion_length": 1995.9464111328125,
"epoch": 0.0881399870382372,
"grad_norm": 0.11656484007835388,
"kl": 0.0001513957977294922,
"learning_rate": 8.717948717948718e-07,
"loss": -0.012,
"reward": 0.30143800005316734,
"reward_std": 0.3431224897503853,
"rewards/improved_len_reward_dast": 0.30143800005316734,
"step": 34
},
{
"completion_length": 2503.1912536621094,
"epoch": 0.09073233959818536,
"grad_norm": 0.12246429920196533,
"kl": 0.00017714500427246094,
"learning_rate": 8.974358974358974e-07,
"loss": -0.0444,
"reward": 0.19646108895540237,
"reward_std": 0.2645679712295532,
"rewards/improved_len_reward_dast": 0.19646108895540237,
"step": 35
},
{
"completion_length": 2238.3545532226562,
"epoch": 0.09332469215813351,
"grad_norm": 0.14648236334323883,
"kl": 0.00016880035400390625,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0673,
"reward": 0.4503837898373604,
"reward_std": 0.21853860095143318,
"rewards/improved_len_reward_dast": 0.4503837898373604,
"step": 36
},
{
"completion_length": 2044.1734313964844,
"epoch": 0.09591704471808166,
"grad_norm": 0.12917490303516388,
"kl": 0.00019884109497070312,
"learning_rate": 9.487179487179486e-07,
"loss": 0.0635,
"reward": 0.47461262345314026,
"reward_std": 0.2628549002110958,
"rewards/improved_len_reward_dast": 0.47461262345314026,
"step": 37
},
{
"completion_length": 1669.1275329589844,
"epoch": 0.0985093972780298,
"grad_norm": 0.20408552885055542,
"kl": 0.00013911724090576172,
"learning_rate": 9.743589743589742e-07,
"loss": 0.0726,
"reward": 0.4879928305745125,
"reward_std": 0.2501045912504196,
"rewards/improved_len_reward_dast": 0.4879928305745125,
"step": 38
},
{
"completion_length": 2456.553466796875,
"epoch": 0.10110174983797797,
"grad_norm": 0.14151060581207275,
"kl": 0.00017547607421875,
"learning_rate": 1e-06,
"loss": 0.0097,
"reward": 0.19765825755894184,
"reward_std": 0.32448211312294006,
"rewards/improved_len_reward_dast": 0.19765825755894184,
"step": 39
},
{
"completion_length": 2550.58154296875,
"epoch": 0.10369410239792612,
"grad_norm": 0.12014975398778915,
"kl": 0.00021982192993164062,
"learning_rate": 9.99981450718918e-07,
"loss": 0.0362,
"reward": 0.24847418442368507,
"reward_std": 0.2784438841044903,
"rewards/improved_len_reward_dast": 0.24847418442368507,
"step": 40
},
{
"completion_length": 1850.7372131347656,
"epoch": 0.10628645495787427,
"grad_norm": 0.13483257591724396,
"kl": 0.00026726722717285156,
"learning_rate": 9.99925804404898e-07,
"loss": -0.0137,
"reward": 0.23939451575279236,
"reward_std": 0.3702044114470482,
"rewards/improved_len_reward_dast": 0.23939451575279236,
"step": 41
},
{
"completion_length": 2014.2091674804688,
"epoch": 0.10887880751782242,
"grad_norm": 0.12142825126647949,
"kl": 0.00023293495178222656,
"learning_rate": 9.998330656454915e-07,
"loss": 0.0555,
"reward": 0.48714711517095566,
"reward_std": 0.2657182738184929,
"rewards/improved_len_reward_dast": 0.48714711517095566,
"step": 42
},
{
"completion_length": 1960.7218933105469,
"epoch": 0.11147116007777058,
"grad_norm": 0.1477634757757187,
"kl": 0.0003228187561035156,
"learning_rate": 9.99703242086198e-07,
"loss": 0.038,
"reward": 0.4138510562479496,
"reward_std": 0.2605874165892601,
"rewards/improved_len_reward_dast": 0.4138510562479496,
"step": 43
},
{
"completion_length": 1840.2117309570312,
"epoch": 0.11406351263771873,
"grad_norm": 0.13796693086624146,
"kl": 0.00024700164794921875,
"learning_rate": 9.995363444298333e-07,
"loss": 0.019,
"reward": 0.5052988603711128,
"reward_std": 0.26378944143652916,
"rewards/improved_len_reward_dast": 0.5052988603711128,
"step": 44
},
{
"completion_length": 2340.3392639160156,
"epoch": 0.11665586519766688,
"grad_norm": 0.11008527874946594,
"kl": 0.00039124488830566406,
"learning_rate": 9.993323864356492e-07,
"loss": 0.0052,
"reward": 0.22386901453137398,
"reward_std": 0.31205885112285614,
"rewards/improved_len_reward_dast": 0.22386901453137398,
"step": 45
},
{
"completion_length": 2944.1682739257812,
"epoch": 0.11924821775761503,
"grad_norm": 0.0939781591296196,
"kl": 0.0002903938293457031,
"learning_rate": 9.990913849181977e-07,
"loss": 0.0107,
"reward": 0.27933235839009285,
"reward_std": 0.28421058878302574,
"rewards/improved_len_reward_dast": 0.27933235839009285,
"step": 46
},
{
"completion_length": 1859.7703704833984,
"epoch": 0.1218405703175632,
"grad_norm": 0.15788155794143677,
"kl": 0.0004353523254394531,
"learning_rate": 9.988133597459444e-07,
"loss": 0.0434,
"reward": 0.387714684009552,
"reward_std": 0.2896231710910797,
"rewards/improved_len_reward_dast": 0.387714684009552,
"step": 47
},
{
"completion_length": 1945.9565734863281,
"epoch": 0.12443292287751134,
"grad_norm": 0.2036711722612381,
"kl": 0.0004405975341796875,
"learning_rate": 9.984983338396323e-07,
"loss": 0.0847,
"reward": 0.4099316783249378,
"reward_std": 0.23737533017992973,
"rewards/improved_len_reward_dast": 0.4099316783249378,
"step": 48
},
{
"completion_length": 1638.6071166992188,
"epoch": 0.1270252754374595,
"grad_norm": 0.15458206832408905,
"kl": 0.00054168701171875,
"learning_rate": 9.981463331703903e-07,
"loss": 0.0575,
"reward": 0.5117293447256088,
"reward_std": 0.24939828738570213,
"rewards/improved_len_reward_dast": 0.5117293447256088,
"step": 49
},
{
"completion_length": 2023.471923828125,
"epoch": 0.12961762799740764,
"grad_norm": 0.14058855175971985,
"kl": 0.0006809234619140625,
"learning_rate": 9.977573867575937e-07,
"loss": -0.0192,
"reward": 0.35891789197921753,
"reward_std": 0.3240862749516964,
"rewards/improved_len_reward_dast": 0.35891789197921753,
"step": 50
},
{
"completion_length": 2265.3264770507812,
"epoch": 0.1322099805573558,
"grad_norm": 0.12996900081634521,
"kl": 0.0007781982421875,
"learning_rate": 9.9733152666647e-07,
"loss": -0.0032,
"reward": 0.420456662774086,
"reward_std": 0.30560050904750824,
"rewards/improved_len_reward_dast": 0.420456662774086,
"step": 51
},
{
"completion_length": 2654.7677612304688,
"epoch": 0.13480233311730394,
"grad_norm": 0.1508190631866455,
"kl": 0.0006885528564453125,
"learning_rate": 9.968687880054579e-07,
"loss": 0.0619,
"reward": 0.46665582805871964,
"reward_std": 0.26216986030340195,
"rewards/improved_len_reward_dast": 0.46665582805871964,
"step": 52
},
{
"completion_length": 1829.56884765625,
"epoch": 0.1373946856772521,
"grad_norm": 0.15750233829021454,
"kl": 0.001232147216796875,
"learning_rate": 9.963692089233104e-07,
"loss": 0.0603,
"reward": 0.35224995017051697,
"reward_std": 0.27216843515634537,
"rewards/improved_len_reward_dast": 0.35224995017051697,
"step": 53
},
{
"completion_length": 1852.9693298339844,
"epoch": 0.13998703823720027,
"grad_norm": 0.14929921925067902,
"kl": 0.0011272430419921875,
"learning_rate": 9.958328306059508e-07,
"loss": -0.0025,
"reward": 0.391354002058506,
"reward_std": 0.24403201416134834,
"rewards/improved_len_reward_dast": 0.391354002058506,
"step": 54
},
{
"completion_length": 2053.5662536621094,
"epoch": 0.1425793907971484,
"grad_norm": 0.12911133468151093,
"kl": 0.0014352798461914062,
"learning_rate": 9.952596972730782e-07,
"loss": -0.0058,
"reward": 0.3265633024275303,
"reward_std": 0.2745610848069191,
"rewards/improved_len_reward_dast": 0.3265633024275303,
"step": 55
},
{
"completion_length": 2230.5382690429688,
"epoch": 0.14517174335709657,
"grad_norm": 0.13832230865955353,
"kl": 0.001399993896484375,
"learning_rate": 9.946498561745201e-07,
"loss": 0.0347,
"reward": 0.4754156991839409,
"reward_std": 0.27349359542131424,
"rewards/improved_len_reward_dast": 0.4754156991839409,
"step": 56
},
{
"completion_length": 2175.3060607910156,
"epoch": 0.14776409591704473,
"grad_norm": 0.13634833693504333,
"kl": 0.0013599395751953125,
"learning_rate": 9.94003357586339e-07,
"loss": 0.011,
"reward": 0.535503476858139,
"reward_std": 0.25894078612327576,
"rewards/improved_len_reward_dast": 0.535503476858139,
"step": 57
},
{
"completion_length": 2355.7091369628906,
"epoch": 0.15035644847699287,
"grad_norm": 0.11235704272985458,
"kl": 0.0012407302856445312,
"learning_rate": 9.933202548066855e-07,
"loss": 0.0133,
"reward": 0.4234941601753235,
"reward_std": 0.24918782338500023,
"rewards/improved_len_reward_dast": 0.4234941601753235,
"step": 58
},
{
"completion_length": 2449.9718627929688,
"epoch": 0.15294880103694103,
"grad_norm": 0.11569388210773468,
"kl": 0.001361846923828125,
"learning_rate": 9.926006041514068e-07,
"loss": 0.0195,
"reward": 0.42598315328359604,
"reward_std": 0.3036581464111805,
"rewards/improved_len_reward_dast": 0.42598315328359604,
"step": 59
},
{
"completion_length": 2177.0101623535156,
"epoch": 0.15554115359688916,
"grad_norm": 0.15036629140377045,
"kl": 0.002197265625,
"learning_rate": 9.918444649494012e-07,
"loss": 0.0499,
"reward": 0.47116725891828537,
"reward_std": 0.2249010019004345,
"rewards/improved_len_reward_dast": 0.47116725891828537,
"step": 60
},
{
"completion_length": 2315.211669921875,
"epoch": 0.15813350615683733,
"grad_norm": 0.12630639970302582,
"kl": 0.002147674560546875,
"learning_rate": 9.9105189953773e-07,
"loss": 0.0155,
"reward": 0.4634978622198105,
"reward_std": 0.3047446385025978,
"rewards/improved_len_reward_dast": 0.4634978622198105,
"step": 61
},
{
"completion_length": 2431.4208374023438,
"epoch": 0.1607258587167855,
"grad_norm": 0.13092079758644104,
"kl": 0.0019016265869140625,
"learning_rate": 9.90222973256475e-07,
"loss": 0.029,
"reward": 0.5218361169099808,
"reward_std": 0.3062875270843506,
"rewards/improved_len_reward_dast": 0.5218361169099808,
"step": 62
},
{
"completion_length": 2737.2295532226562,
"epoch": 0.16331821127673363,
"grad_norm": 0.12846292555332184,
"kl": 0.001842498779296875,
"learning_rate": 9.89357754443355e-07,
"loss": 0.0341,
"reward": 0.3177746832370758,
"reward_std": 0.2347201406955719,
"rewards/improved_len_reward_dast": 0.3177746832370758,
"step": 63
},
{
"completion_length": 2820.30859375,
"epoch": 0.1659105638366818,
"grad_norm": 0.11326766014099121,
"kl": 0.002269744873046875,
"learning_rate": 9.884563144280897e-07,
"loss": 0.0303,
"reward": 0.40527529269456863,
"reward_std": 0.27593884617090225,
"rewards/improved_len_reward_dast": 0.40527529269456863,
"step": 64
},
{
"completion_length": 2124.1249389648438,
"epoch": 0.16850291639662995,
"grad_norm": 0.16672682762145996,
"kl": 0.002834320068359375,
"learning_rate": 9.875187275265198e-07,
"loss": 0.0854,
"reward": 0.44389794766902924,
"reward_std": 0.239344272762537,
"rewards/improved_len_reward_dast": 0.44389794766902924,
"step": 65
},
{
"completion_length": 2648.7473754882812,
"epoch": 0.1710952689565781,
"grad_norm": 0.12017077952623367,
"kl": 0.002216339111328125,
"learning_rate": 9.865450710344807e-07,
"loss": 0.0354,
"reward": 0.34619488939642906,
"reward_std": 0.23249895870685577,
"rewards/improved_len_reward_dast": 0.34619488939642906,
"step": 66
},
{
"completion_length": 2351.4132385253906,
"epoch": 0.17368762151652625,
"grad_norm": 0.13515809178352356,
"kl": 0.00341796875,
"learning_rate": 9.855354252214307e-07,
"loss": 0.0206,
"reward": 0.4116981029510498,
"reward_std": 0.28415245935320854,
"rewards/improved_len_reward_dast": 0.4116981029510498,
"step": 67
},
{
"completion_length": 1852.6810302734375,
"epoch": 0.1762799740764744,
"grad_norm": 0.17753329873085022,
"kl": 0.003093719482421875,
"learning_rate": 9.844898733238311e-07,
"loss": 0.0359,
"reward": 0.5103119313716888,
"reward_std": 0.28220145776867867,
"rewards/improved_len_reward_dast": 0.5103119313716888,
"step": 68
},
{
"completion_length": 2115.632568359375,
"epoch": 0.17887232663642255,
"grad_norm": 0.14785140752792358,
"kl": 0.003803253173828125,
"learning_rate": 9.83408501538287e-07,
"loss": 0.0448,
"reward": 0.4045008569955826,
"reward_std": 0.27792026475071907,
"rewards/improved_len_reward_dast": 0.4045008569955826,
"step": 69
},
{
"completion_length": 1836.5025329589844,
"epoch": 0.18146467919637072,
"grad_norm": 0.13213248550891876,
"kl": 0.003261566162109375,
"learning_rate": 9.822913990144387e-07,
"loss": 0.0005,
"reward": 0.3931305408477783,
"reward_std": 0.30834557116031647,
"rewards/improved_len_reward_dast": 0.3931305408477783,
"step": 70
},
{
"completion_length": 2272.7142639160156,
"epoch": 0.18405703175631885,
"grad_norm": 0.18136066198349,
"kl": 0.0032138824462890625,
"learning_rate": 9.811386578476146e-07,
"loss": 0.0901,
"reward": 0.5118075683712959,
"reward_std": 0.24460211768746376,
"rewards/improved_len_reward_dast": 0.5118075683712959,
"step": 71
},
{
"completion_length": 2046.1580810546875,
"epoch": 0.18664938431626701,
"grad_norm": 0.16394267976284027,
"kl": 0.003330230712890625,
"learning_rate": 9.79950373071236e-07,
"loss": 0.0388,
"reward": 0.476028174161911,
"reward_std": 0.2648630440235138,
"rewards/improved_len_reward_dast": 0.476028174161911,
"step": 72
},
{
"completion_length": 2163.252471923828,
"epoch": 0.18924173687621518,
"grad_norm": 0.14138321578502655,
"kl": 0.004444122314453125,
"learning_rate": 9.787266426489845e-07,
"loss": 0.0132,
"reward": 0.441867433488369,
"reward_std": 0.2500956766307354,
"rewards/improved_len_reward_dast": 0.441867433488369,
"step": 73
},
{
"completion_length": 2289.507598876953,
"epoch": 0.1918340894361633,
"grad_norm": 0.1288045346736908,
"kl": 0.00394439697265625,
"learning_rate": 9.77467567466725e-07,
"loss": 0.005,
"reward": 0.4050723984837532,
"reward_std": 0.3099602647125721,
"rewards/improved_len_reward_dast": 0.4050723984837532,
"step": 74
},
{
"completion_length": 2383.9718627929688,
"epoch": 0.19442644199611148,
"grad_norm": 0.14801673591136932,
"kl": 0.004150390625,
"learning_rate": 9.761732513241882e-07,
"loss": 0.052,
"reward": 0.4946385696530342,
"reward_std": 0.2349606677889824,
"rewards/improved_len_reward_dast": 0.4946385696530342,
"step": 75
},
{
"completion_length": 2045.8979187011719,
"epoch": 0.1970187945560596,
"grad_norm": 0.17007729411125183,
"kl": 0.0055694580078125,
"learning_rate": 9.748438009264142e-07,
"loss": 0.0672,
"reward": 0.5539986491203308,
"reward_std": 0.20796510577201843,
"rewards/improved_len_reward_dast": 0.5539986491203308,
"step": 76
},
{
"completion_length": 2371.7626953125,
"epoch": 0.19961114711600778,
"grad_norm": 0.12920302152633667,
"kl": 0.00461578369140625,
"learning_rate": 9.734793258749538e-07,
"loss": 0.0104,
"reward": 0.5021207295358181,
"reward_std": 0.2304290495812893,
"rewards/improved_len_reward_dast": 0.5021207295358181,
"step": 77
},
{
"completion_length": 2612.2882080078125,
"epoch": 0.20220349967595594,
"grad_norm": 0.12678897380828857,
"kl": 0.00510406494140625,
"learning_rate": 9.720799386588358e-07,
"loss": 0.0317,
"reward": 0.4242451824247837,
"reward_std": 0.2596823424100876,
"rewards/improved_len_reward_dast": 0.4242451824247837,
"step": 78
},
{
"completion_length": 2259.831573486328,
"epoch": 0.20479585223590407,
"grad_norm": 0.1500682830810547,
"kl": 0.00511932373046875,
"learning_rate": 9.706457546452898e-07,
"loss": -0.0061,
"reward": 0.42074430361390114,
"reward_std": 0.24277805909514427,
"rewards/improved_len_reward_dast": 0.42074430361390114,
"step": 79
},
{
"completion_length": 2893.040771484375,
"epoch": 0.20738820479585224,
"grad_norm": 0.10854899138212204,
"kl": 0.004856109619140625,
"learning_rate": 9.691768920702379e-07,
"loss": -0.0021,
"reward": 0.33654000610113144,
"reward_std": 0.1983367819339037,
"rewards/improved_len_reward_dast": 0.33654000610113144,
"step": 80
},
{
"completion_length": 2305.7474365234375,
"epoch": 0.2099805573558004,
"grad_norm": 0.12965475022792816,
"kl": 0.00487518310546875,
"learning_rate": 9.676734720285456e-07,
"loss": -0.0154,
"reward": 0.4322432279586792,
"reward_std": 0.2519207112491131,
"rewards/improved_len_reward_dast": 0.4322432279586792,
"step": 81
},
{
"completion_length": 2196.4744567871094,
"epoch": 0.21257290991574854,
"grad_norm": 0.23942671716213226,
"kl": 0.01129150390625,
"learning_rate": 9.661356184640394e-07,
"loss": 0.0429,
"reward": 0.4929245412349701,
"reward_std": 0.21502425894141197,
"rewards/improved_len_reward_dast": 0.4929245412349701,
"step": 82
},
{
"completion_length": 2185.619873046875,
"epoch": 0.2151652624756967,
"grad_norm": 0.1231599673628807,
"kl": 0.00595855712890625,
"learning_rate": 9.64563458159288e-07,
"loss": 0.0282,
"reward": 0.4786108732223511,
"reward_std": 0.28226276487112045,
"rewards/improved_len_reward_dast": 0.4786108732223511,
"step": 83
},
{
"completion_length": 2437.6173400878906,
"epoch": 0.21775761503564484,
"grad_norm": 0.14213241636753082,
"kl": 0.005523681640625,
"learning_rate": 9.629571207251515e-07,
"loss": 0.0442,
"reward": 0.48524054139852524,
"reward_std": 0.22880307212471962,
"rewards/improved_len_reward_dast": 0.48524054139852524,
"step": 84
},
{
"completion_length": 2338.068817138672,
"epoch": 0.220349967595593,
"grad_norm": 0.1439816653728485,
"kl": 0.0064239501953125,
"learning_rate": 9.613167385900944e-07,
"loss": 0.0381,
"reward": 0.32151066698133945,
"reward_std": 0.2240244559943676,
"rewards/improved_len_reward_dast": 0.32151066698133945,
"step": 85
},
{
"completion_length": 2180.928497314453,
"epoch": 0.22294232015554116,
"grad_norm": 0.1326073408126831,
"kl": 0.00629425048828125,
"learning_rate": 9.59642446989269e-07,
"loss": -0.0225,
"reward": 0.46004387736320496,
"reward_std": 0.29043208435177803,
"rewards/improved_len_reward_dast": 0.46004387736320496,
"step": 86
},
{
"completion_length": 2378.7499389648438,
"epoch": 0.2255346727154893,
"grad_norm": 0.13580094277858734,
"kl": 0.00611114501953125,
"learning_rate": 9.579343839533668e-07,
"loss": 0.0344,
"reward": 0.46859942376613617,
"reward_std": 0.213957991451025,
"rewards/improved_len_reward_dast": 0.46859942376613617,
"step": 87
},
{
"completion_length": 2216.5968627929688,
"epoch": 0.22812702527543746,
"grad_norm": 0.1301027089357376,
"kl": 0.00576019287109375,
"learning_rate": 9.561926902972378e-07,
"loss": 0.0187,
"reward": 0.4914589300751686,
"reward_std": 0.2609393447637558,
"rewards/improved_len_reward_dast": 0.4914589300751686,
"step": 88
},
{
"completion_length": 1875.9923095703125,
"epoch": 0.23071937783538563,
"grad_norm": 0.14184825122356415,
"kl": 0.00583648681640625,
"learning_rate": 9.544175096082838e-07,
"loss": 0.0363,
"reward": 0.5709837153553963,
"reward_std": 0.2582616098225117,
"rewards/improved_len_reward_dast": 0.5709837153553963,
"step": 89
},
{
"completion_length": 2039.7499694824219,
"epoch": 0.23331173039533376,
"grad_norm": 0.14913320541381836,
"kl": 0.00595855712890625,
"learning_rate": 9.526089882346172e-07,
"loss": 0.045,
"reward": 0.4515961930155754,
"reward_std": 0.2736925035715103,
"rewards/improved_len_reward_dast": 0.4515961930155754,
"step": 90
},
{
"completion_length": 2091.1401977539062,
"epoch": 0.23590408295528192,
"grad_norm": 0.15127432346343994,
"kl": 0.005279541015625,
"learning_rate": 9.507672752730001e-07,
"loss": 0.0054,
"reward": 0.402485728263855,
"reward_std": 0.29755549877882004,
"rewards/improved_len_reward_dast": 0.402485728263855,
"step": 91
},
{
"completion_length": 2158.0076293945312,
"epoch": 0.23849643551523006,
"grad_norm": 0.13994944095611572,
"kl": 0.0059356689453125,
"learning_rate": 9.4889252255655e-07,
"loss": 0.0189,
"reward": 0.4199903607368469,
"reward_std": 0.2342899888753891,
"rewards/improved_len_reward_dast": 0.4199903607368469,
"step": 92
},
{
"completion_length": 2170.688720703125,
"epoch": 0.24108878807517822,
"grad_norm": 0.1373533308506012,
"kl": 0.00689697265625,
"learning_rate": 9.469848846422223e-07,
"loss": -0.0002,
"reward": 0.2835959419608116,
"reward_std": 0.25099899992346764,
"rewards/improved_len_reward_dast": 0.2835959419608116,
"step": 93
},
{
"completion_length": 2395.0025329589844,
"epoch": 0.2436811406351264,
"grad_norm": 0.1902032494544983,
"kl": 0.00583648681640625,
"learning_rate": 9.450445187980699e-07,
"loss": 0.0584,
"reward": 0.35247352346777916,
"reward_std": 0.30481256917119026,
"rewards/improved_len_reward_dast": 0.35247352346777916,
"step": 94
},
{
"completion_length": 2286.4693603515625,
"epoch": 0.24627349319507452,
"grad_norm": 0.13147291541099548,
"kl": 0.00756072998046875,
"learning_rate": 9.430715849902774e-07,
"loss": -0.0056,
"reward": 0.4530554786324501,
"reward_std": 0.25667278096079826,
"rewards/improved_len_reward_dast": 0.4530554786324501,
"step": 95
},
{
"completion_length": 2043.8111877441406,
"epoch": 0.24886584575502269,
"grad_norm": 0.18014930188655853,
"kl": 0.00627899169921875,
"learning_rate": 9.410662458699723e-07,
"loss": 0.0794,
"reward": 0.48161032050848007,
"reward_std": 0.21756469458341599,
"rewards/improved_len_reward_dast": 0.48161032050848007,
"step": 96
},
{
"completion_length": 1489.5254821777344,
"epoch": 0.25145819831497085,
"grad_norm": 0.17594148218631744,
"kl": 0.0061187744140625,
"learning_rate": 9.390286667598169e-07,
"loss": 0.0505,
"reward": 0.4777970463037491,
"reward_std": 0.2606087028980255,
"rewards/improved_len_reward_dast": 0.4777970463037491,
"step": 97
},
{
"completion_length": 1849.7627258300781,
"epoch": 0.254050550874919,
"grad_norm": 0.18699869513511658,
"kl": 0.00595855712890625,
"learning_rate": 9.369590156403784e-07,
"loss": 0.021,
"reward": 0.5154428780078888,
"reward_std": 0.25940926000475883,
"rewards/improved_len_reward_dast": 0.5154428780078888,
"step": 98
},
{
"completion_length": 1921.4820861816406,
"epoch": 0.2566429034348671,
"grad_norm": 0.17932431399822235,
"kl": 0.00734710693359375,
"learning_rate": 9.348574631362808e-07,
"loss": 0.0448,
"reward": 0.518772654235363,
"reward_std": 0.22472433000802994,
"rewards/improved_len_reward_dast": 0.518772654235363,
"step": 99
},
{
"completion_length": 1558.698959350586,
"epoch": 0.2592352559948153,
"grad_norm": 0.2216687798500061,
"kl": 0.00499725341796875,
"learning_rate": 9.327241825021379e-07,
"loss": 0.0994,
"reward": 0.5730864778161049,
"reward_std": 0.23495277762413025,
"rewards/improved_len_reward_dast": 0.5730864778161049,
"step": 100
},
{
"completion_length": 2075.681121826172,
"epoch": 0.26182760855476345,
"grad_norm": 0.17743688821792603,
"kl": 0.00689697265625,
"learning_rate": 9.3055934960827e-07,
"loss": 0.0504,
"reward": 0.49749719351530075,
"reward_std": 0.24989648535847664,
"rewards/improved_len_reward_dast": 0.49749719351530075,
"step": 101
},
{
"completion_length": 2060.5254821777344,
"epoch": 0.2644199611147116,
"grad_norm": 0.1442503184080124,
"kl": 0.00804901123046875,
"learning_rate": 9.283631429262053e-07,
"loss": 0.0444,
"reward": 0.5253574028611183,
"reward_std": 0.26894206926226616,
"rewards/improved_len_reward_dast": 0.5253574028611183,
"step": 102
},
{
"completion_length": 1975.1708679199219,
"epoch": 0.2670123136746598,
"grad_norm": 0.1966535896062851,
"kl": 0.00787353515625,
"learning_rate": 9.261357435139665e-07,
"loss": -0.0423,
"reward": 0.3904944434762001,
"reward_std": 0.252847608178854,
"rewards/improved_len_reward_dast": 0.3904944434762001,
"step": 103
},
{
"completion_length": 1903.8239440917969,
"epoch": 0.2696046662346079,
"grad_norm": 0.16377773880958557,
"kl": 0.00766754150390625,
"learning_rate": 9.238773350011437e-07,
"loss": 0.0337,
"reward": 0.5364998355507851,
"reward_std": 0.22807660326361656,
"rewards/improved_len_reward_dast": 0.5364998355507851,
"step": 104
},
{
"completion_length": 1964.03564453125,
"epoch": 0.27219701879455604,
"grad_norm": 0.15731562674045563,
"kl": 0.00638580322265625,
"learning_rate": 9.215881035737557e-07,
"loss": 0.03,
"reward": 0.5687462911009789,
"reward_std": 0.24159640073776245,
"rewards/improved_len_reward_dast": 0.5687462911009789,
"step": 105
},
{
"completion_length": 2086.864776611328,
"epoch": 0.2747893713545042,
"grad_norm": 0.1402043104171753,
"kl": 0.007659912109375,
"learning_rate": 9.192682379589017e-07,
"loss": 0.0097,
"reward": 0.5089325457811356,
"reward_std": 0.3301768898963928,
"rewards/improved_len_reward_dast": 0.5089325457811356,
"step": 106
},
{
"completion_length": 2234.5228576660156,
"epoch": 0.27738172391445237,
"grad_norm": 0.11498509347438812,
"kl": 0.0081634521484375,
"learning_rate": 9.169179294092006e-07,
"loss": 0.0083,
"reward": 0.4969679266214371,
"reward_std": 0.23030569776892662,
"rewards/improved_len_reward_dast": 0.4969679266214371,
"step": 107
},
{
"completion_length": 1707.2627258300781,
"epoch": 0.27997407647440054,
"grad_norm": 0.14997334778308868,
"kl": 0.006744384765625,
"learning_rate": 9.145373716870257e-07,
"loss": 0.0103,
"reward": 0.49637529253959656,
"reward_std": 0.2502065673470497,
"rewards/improved_len_reward_dast": 0.49637529253959656,
"step": 108
},
{
"completion_length": 2420.5662231445312,
"epoch": 0.2825664290343487,
"grad_norm": 0.18093426525592804,
"kl": 0.009765625,
"learning_rate": 9.121267610485294e-07,
"loss": 0.0507,
"reward": 0.45108526200056076,
"reward_std": 0.27778685092926025,
"rewards/improved_len_reward_dast": 0.45108526200056076,
"step": 109
},
{
"completion_length": 1913.4413146972656,
"epoch": 0.2851587815942968,
"grad_norm": 0.129420667886734,
"kl": 0.00807952880859375,
"learning_rate": 9.096862962274642e-07,
"loss": 0.0184,
"reward": 0.47038712725043297,
"reward_std": 0.23162546008825302,
"rewards/improved_len_reward_dast": 0.47038712725043297,
"step": 110
},
{
"completion_length": 2138.313751220703,
"epoch": 0.28775113415424497,
"grad_norm": 0.13907843828201294,
"kl": 0.00921630859375,
"learning_rate": 9.072161784187988e-07,
"loss": 0.003,
"reward": 0.39113760739564896,
"reward_std": 0.31567446142435074,
"rewards/improved_len_reward_dast": 0.39113760739564896,
"step": 111
},
{
"completion_length": 1697.0076293945312,
"epoch": 0.29034348671419313,
"grad_norm": 0.1390790045261383,
"kl": 0.00740814208984375,
"learning_rate": 9.047166112621312e-07,
"loss": 0.0311,
"reward": 0.4854539856314659,
"reward_std": 0.25831175222992897,
"rewards/improved_len_reward_dast": 0.4854539856314659,
"step": 112
},
{
"completion_length": 1920.2116394042969,
"epoch": 0.2929358392741413,
"grad_norm": 0.15448330342769623,
"kl": 0.008331298828125,
"learning_rate": 9.021878008249001e-07,
"loss": 0.0309,
"reward": 0.5033985450863838,
"reward_std": 0.2514248192310333,
"rewards/improved_len_reward_dast": 0.5033985450863838,
"step": 113
},
{
"completion_length": 1778.3877258300781,
"epoch": 0.29552819183408946,
"grad_norm": 0.18551130592823029,
"kl": 0.0085601806640625,
"learning_rate": 8.996299555853973e-07,
"loss": 0.0592,
"reward": 0.5357353314757347,
"reward_std": 0.25978413224220276,
"rewards/improved_len_reward_dast": 0.5357353314757347,
"step": 114
},
{
"completion_length": 1744.0815734863281,
"epoch": 0.29812054439403757,
"grad_norm": 0.1741955578327179,
"kl": 0.008270263671875,
"learning_rate": 8.970432864155798e-07,
"loss": 0.0581,
"reward": 0.44869130104780197,
"reward_std": 0.27859310433268547,
"rewards/improved_len_reward_dast": 0.44869130104780197,
"step": 115
},
{
"completion_length": 1891.1683044433594,
"epoch": 0.30071289695398573,
"grad_norm": 0.16727615892887115,
"kl": 0.0091094970703125,
"learning_rate": 8.944280065636851e-07,
"loss": -0.0155,
"reward": 0.5161800310015678,
"reward_std": 0.2364092469215393,
"rewards/improved_len_reward_dast": 0.5161800310015678,
"step": 116
},
{
"completion_length": 1760.517822265625,
"epoch": 0.3033052495139339,
"grad_norm": 0.16244257986545563,
"kl": 0.0087127685546875,
"learning_rate": 8.917843316366515e-07,
"loss": 0.0261,
"reward": 0.5459260642528534,
"reward_std": 0.23583999276161194,
"rewards/improved_len_reward_dast": 0.5459260642528534,
"step": 117
},
{
"completion_length": 1983.0484619140625,
"epoch": 0.30589760207388206,
"grad_norm": 0.1473926603794098,
"kl": 0.00963592529296875,
"learning_rate": 8.891124795823426e-07,
"loss": 0.0132,
"reward": 0.3325341437011957,
"reward_std": 0.25720784440636635,
"rewards/improved_len_reward_dast": 0.3325341437011957,
"step": 118
},
{
"completion_length": 1990.2703552246094,
"epoch": 0.3084899546338302,
"grad_norm": 0.15402625501155853,
"kl": 0.009613037109375,
"learning_rate": 8.864126706715796e-07,
"loss": 0.0303,
"reward": 0.47167035937309265,
"reward_std": 0.20868681743741035,
"rewards/improved_len_reward_dast": 0.47167035937309265,
"step": 119
},
{
"completion_length": 2067.3621826171875,
"epoch": 0.31108230719377833,
"grad_norm": 0.14856307208538055,
"kl": 0.0103302001953125,
"learning_rate": 8.83685127479982e-07,
"loss": 0.0497,
"reward": 0.5158610492944717,
"reward_std": 0.24829266592860222,
"rewards/improved_len_reward_dast": 0.5158610492944717,
"step": 120
},
{
"completion_length": 1695.1377410888672,
"epoch": 0.3136746597537265,
"grad_norm": 0.1789526492357254,
"kl": 0.00838470458984375,
"learning_rate": 8.809300748696173e-07,
"loss": 0.0452,
"reward": 0.4681314527988434,
"reward_std": 0.27835647389292717,
"rewards/improved_len_reward_dast": 0.4681314527988434,
"step": 121
},
{
"completion_length": 2345.4464111328125,
"epoch": 0.31626701231367466,
"grad_norm": 0.1702735722064972,
"kl": 0.01092529296875,
"learning_rate": 8.781477399704652e-07,
"loss": 0.0505,
"reward": 0.4150802828371525,
"reward_std": 0.2282225303351879,
"rewards/improved_len_reward_dast": 0.4150802828371525,
"step": 122
},
{
"completion_length": 2215.8545532226562,
"epoch": 0.3188593648736228,
"grad_norm": 0.1643056720495224,
"kl": 0.010467529296875,
"learning_rate": 8.753383521616902e-07,
"loss": -0.0006,
"reward": 0.4899800196290016,
"reward_std": 0.2781127095222473,
"rewards/improved_len_reward_dast": 0.4899800196290016,
"step": 123
},
{
"completion_length": 1792.7601928710938,
"epoch": 0.321451717433571,
"grad_norm": 0.18634077906608582,
"kl": 0.00847625732421875,
"learning_rate": 8.72502143052733e-07,
"loss": 0.0014,
"reward": 0.3171217106282711,
"reward_std": 0.27590419724583626,
"rewards/improved_len_reward_dast": 0.3171217106282711,
"step": 124
},
{
"completion_length": 1776.8545227050781,
"epoch": 0.32404406999351915,
"grad_norm": 0.12118836492300034,
"kl": 0.0086212158203125,
"learning_rate": 8.696393464642158e-07,
"loss": -0.0068,
"reward": 0.5544345825910568,
"reward_std": 0.24507181718945503,
"rewards/improved_len_reward_dast": 0.5544345825910568,
"step": 125
},
{
"completion_length": 1694.5203857421875,
"epoch": 0.32663642255346725,
"grad_norm": 0.1287376880645752,
"kl": 0.00778961181640625,
"learning_rate": 8.667501984086655e-07,
"loss": 0.007,
"reward": 0.5977945774793625,
"reward_std": 0.245724493637681,
"rewards/improved_len_reward_dast": 0.5977945774793625,
"step": 126
},
{
"completion_length": 1636.5458679199219,
"epoch": 0.3292287751134154,
"grad_norm": 0.14778032898902893,
"kl": 0.00799560546875,
"learning_rate": 8.638349370710573e-07,
"loss": 0.0288,
"reward": 0.4892159327864647,
"reward_std": 0.21941957622766495,
"rewards/improved_len_reward_dast": 0.4892159327864647,
"step": 127
},
{
"completion_length": 1714.7295532226562,
"epoch": 0.3318211276733636,
"grad_norm": 0.1856471598148346,
"kl": 0.00775909423828125,
"learning_rate": 8.608938027891775e-07,
"loss": -0.0209,
"reward": 0.49680083245038986,
"reward_std": 0.31941552460193634,
"rewards/improved_len_reward_dast": 0.49680083245038986,
"step": 128
},
{
"completion_length": 1622.4820861816406,
"epoch": 0.33441348023331174,
"grad_norm": 0.14166595041751862,
"kl": 0.00904083251953125,
"learning_rate": 8.579270380338107e-07,
"loss": 0.0349,
"reward": 0.5298355668783188,
"reward_std": 0.25498587638139725,
"rewards/improved_len_reward_dast": 0.5298355668783188,
"step": 129
},
{
"completion_length": 2226.5203552246094,
"epoch": 0.3370058327932599,
"grad_norm": 0.13678768277168274,
"kl": 0.010528564453125,
"learning_rate": 8.549348873887496e-07,
"loss": 0.0306,
"reward": 0.32516562566161156,
"reward_std": 0.2915949523448944,
"rewards/improved_len_reward_dast": 0.32516562566161156,
"step": 130
},
{
"completion_length": 1750.5968933105469,
"epoch": 0.339598185353208,
"grad_norm": 0.6094262003898621,
"kl": 0.01262664794921875,
"learning_rate": 8.519175975306312e-07,
"loss": 0.0175,
"reward": 0.34795505669899285,
"reward_std": 0.29810576513409615,
"rewards/improved_len_reward_dast": 0.34795505669899285,
"step": 131
},
{
"completion_length": 1810.3775024414062,
"epoch": 0.3421905379131562,
"grad_norm": 0.13672804832458496,
"kl": 0.0082855224609375,
"learning_rate": 8.48875417208601e-07,
"loss": 0.0123,
"reward": 0.5421003252267838,
"reward_std": 0.23054074123501778,
"rewards/improved_len_reward_dast": 0.5421003252267838,
"step": 132
},
{
"completion_length": 1827.4871826171875,
"epoch": 0.34478289047310434,
"grad_norm": 0.18760238587856293,
"kl": 0.00931549072265625,
"learning_rate": 8.458085972238048e-07,
"loss": -0.0378,
"reward": 0.3316265791654587,
"reward_std": 0.3142329826951027,
"rewards/improved_len_reward_dast": 0.3316265791654587,
"step": 133
},
{
"completion_length": 1851.31884765625,
"epoch": 0.3473752430330525,
"grad_norm": 0.17281284928321838,
"kl": 0.01065826416015625,
"learning_rate": 8.427173904087138e-07,
"loss": 0.0098,
"reward": 0.3921409696340561,
"reward_std": 0.3084140866994858,
"rewards/improved_len_reward_dast": 0.3921409696340561,
"step": 134
},
{
"completion_length": 2403.4004516601562,
"epoch": 0.34996759559300067,
"grad_norm": 0.12713146209716797,
"kl": 0.01275634765625,
"learning_rate": 8.396020516062794e-07,
"loss": 0.0099,
"reward": 0.3863501325249672,
"reward_std": 0.23984722048044205,
"rewards/improved_len_reward_dast": 0.3863501325249672,
"step": 135
},
{
"completion_length": 1436.4285278320312,
"epoch": 0.3525599481529488,
"grad_norm": 0.23622222244739532,
"kl": 0.00687408447265625,
"learning_rate": 8.364628376489242e-07,
"loss": 0.0785,
"reward": 0.6200843900442123,
"reward_std": 0.2323874980211258,
"rewards/improved_len_reward_dast": 0.6200843900442123,
"step": 136
},
{
"completion_length": 1340.3367004394531,
"epoch": 0.35515230071289694,
"grad_norm": 0.14602254331111908,
"kl": 0.0074920654296875,
"learning_rate": 8.333000073373685e-07,
"loss": 0.0062,
"reward": 0.5035427659749985,
"reward_std": 0.2632300853729248,
"rewards/improved_len_reward_dast": 0.5035427659749985,
"step": 137
},
{
"completion_length": 1822.9515075683594,
"epoch": 0.3577446532728451,
"grad_norm": 0.1399223655462265,
"kl": 0.0090789794921875,
"learning_rate": 8.301138214192945e-07,
"loss": 0.0151,
"reward": 0.45684105157852173,
"reward_std": 0.28661157563328743,
"rewards/improved_len_reward_dast": 0.45684105157852173,
"step": 138
},
{
"completion_length": 1932.7831115722656,
"epoch": 0.36033700583279327,
"grad_norm": 0.14497311413288116,
"kl": 0.010772705078125,
"learning_rate": 8.269045425678497e-07,
"loss": 0.026,
"reward": 0.5445379167795181,
"reward_std": 0.23734620586037636,
"rewards/improved_len_reward_dast": 0.5445379167795181,
"step": 139
},
{
"completion_length": 1724.9464111328125,
"epoch": 0.36292935839274143,
"grad_norm": 0.15263906121253967,
"kl": 0.00861358642578125,
"learning_rate": 8.236724353599918e-07,
"loss": 0.0175,
"reward": 0.667856439948082,
"reward_std": 0.23194020241498947,
"rewards/improved_len_reward_dast": 0.667856439948082,
"step": 140
},
{
"completion_length": 1628.3902893066406,
"epoch": 0.36552171095268954,
"grad_norm": 0.1453440636396408,
"kl": 0.00885009765625,
"learning_rate": 8.204177662546763e-07,
"loss": 0.0253,
"reward": 0.5715875178575516,
"reward_std": 0.20206843689084053,
"rewards/improved_len_reward_dast": 0.5715875178575516,
"step": 141
},
{
"completion_length": 1601.8902893066406,
"epoch": 0.3681140635126377,
"grad_norm": 0.24610204994678497,
"kl": 0.00897979736328125,
"learning_rate": 8.171408035708906e-07,
"loss": 0.077,
"reward": 0.5970557183027267,
"reward_std": 0.21772165596485138,
"rewards/improved_len_reward_dast": 0.5970557183027267,
"step": 142
},
{
"completion_length": 1380.9310913085938,
"epoch": 0.37070641607258586,
"grad_norm": 0.1606459617614746,
"kl": 0.0074005126953125,
"learning_rate": 8.138418174655323e-07,
"loss": 0.0208,
"reward": 0.6311784163117409,
"reward_std": 0.21403341740369797,
"rewards/improved_len_reward_dast": 0.6311784163117409,
"step": 143
},
{
"completion_length": 2226.7908325195312,
"epoch": 0.37329876863253403,
"grad_norm": 0.13883228600025177,
"kl": 0.0110626220703125,
"learning_rate": 8.105210799111366e-07,
"loss": 0.0192,
"reward": 0.4252306818962097,
"reward_std": 0.2911713309586048,
"rewards/improved_len_reward_dast": 0.4252306818962097,
"step": 144
},
{
"completion_length": 1767.211669921875,
"epoch": 0.3758911211924822,
"grad_norm": 0.13391607999801636,
"kl": 0.01000213623046875,
"learning_rate": 8.071788646734564e-07,
"loss": -0.0125,
"reward": 0.5215009152889252,
"reward_std": 0.20849771052598953,
"rewards/improved_len_reward_dast": 0.5215009152889252,
"step": 145
},
{
"completion_length": 1668.6147766113281,
"epoch": 0.37848347375243035,
"grad_norm": 0.14263834059238434,
"kl": 0.00905609130859375,
"learning_rate": 8.038154472888909e-07,
"loss": -0.0016,
"reward": 0.5347848311066628,
"reward_std": 0.2661595940589905,
"rewards/improved_len_reward_dast": 0.5347848311066628,
"step": 146
},
{
"completion_length": 1830.3596801757812,
"epoch": 0.38107582631237846,
"grad_norm": 0.17779265344142914,
"kl": 0.0116119384765625,
"learning_rate": 8.004311050417711e-07,
"loss": -0.0093,
"reward": 0.49393337965011597,
"reward_std": 0.2844499684870243,
"rewards/improved_len_reward_dast": 0.49393337965011597,
"step": 147
},
{
"completion_length": 1709.0025329589844,
"epoch": 0.3836681788723266,
"grad_norm": 0.15148715674877167,
"kl": 0.0105743408203125,
"learning_rate": 7.970261169414999e-07,
"loss": 0.0157,
"reward": 0.5047090724110603,
"reward_std": 0.2441636137664318,
"rewards/improved_len_reward_dast": 0.5047090724110603,
"step": 148
},
{
"completion_length": 2119.3775024414062,
"epoch": 0.3862605314322748,
"grad_norm": 0.22775354981422424,
"kl": 0.0129241943359375,
"learning_rate": 7.936007636995497e-07,
"loss": 0.0774,
"reward": 0.49651817977428436,
"reward_std": 0.2400597222149372,
"rewards/improved_len_reward_dast": 0.49651817977428436,
"step": 149
},
{
"completion_length": 1689.4004821777344,
"epoch": 0.38885288399222295,
"grad_norm": 0.15328077971935272,
"kl": 0.00994873046875,
"learning_rate": 7.901553277063213e-07,
"loss": 0.0078,
"reward": 0.3569689057767391,
"reward_std": 0.3229844532907009,
"rewards/improved_len_reward_dast": 0.3569689057767391,
"step": 150
},
{
"completion_length": 2073.864776611328,
"epoch": 0.3914452365521711,
"grad_norm": 0.19549565017223358,
"kl": 0.0137786865234375,
"learning_rate": 7.866900930078618e-07,
"loss": 0.0528,
"reward": 0.5197746828198433,
"reward_std": 0.24571574851870537,
"rewards/improved_len_reward_dast": 0.5197746828198433,
"step": 151
},
{
"completion_length": 1904.7984313964844,
"epoch": 0.3940375891121192,
"grad_norm": 0.16537566483020782,
"kl": 0.0112762451171875,
"learning_rate": 7.832053452824489e-07,
"loss": 0.0443,
"reward": 0.5653045251965523,
"reward_std": 0.26458077877759933,
"rewards/improved_len_reward_dast": 0.5653045251965523,
"step": 152
},
{
"completion_length": 1810.0535278320312,
"epoch": 0.3966299416720674,
"grad_norm": 0.1673276126384735,
"kl": 0.01409912109375,
"learning_rate": 7.797013718170384e-07,
"loss": 0.0361,
"reward": 0.4529588147997856,
"reward_std": 0.24421193450689316,
"rewards/improved_len_reward_dast": 0.4529588147997856,
"step": 153
},
{
"completion_length": 1624.7015075683594,
"epoch": 0.39922229423201555,
"grad_norm": 0.15807899832725525,
"kl": 0.011444091796875,
"learning_rate": 7.761784614835801e-07,
"loss": 0.014,
"reward": 0.4734058678150177,
"reward_std": 0.32842234522104263,
"rewards/improved_len_reward_dast": 0.4734058678150177,
"step": 154
},
{
"completion_length": 1876.540756225586,
"epoch": 0.4018146467919637,
"grad_norm": 0.18241144716739655,
"kl": 0.01232147216796875,
"learning_rate": 7.726369047152029e-07,
"loss": 0.0244,
"reward": 0.4645438566803932,
"reward_std": 0.2389094103127718,
"rewards/improved_len_reward_dast": 0.4645438566803932,
"step": 155
},
{
"completion_length": 1693.3367004394531,
"epoch": 0.4044069993519119,
"grad_norm": 0.17326728999614716,
"kl": 0.00902557373046875,
"learning_rate": 7.690769934822712e-07,
"loss": 0.0494,
"reward": 0.4986276477575302,
"reward_std": 0.2973395735025406,
"rewards/improved_len_reward_dast": 0.4986276477575302,
"step": 156
},
{
"completion_length": 1658.6785278320312,
"epoch": 0.40699935191186,
"grad_norm": 0.20593588054180145,
"kl": 0.0114288330078125,
"learning_rate": 7.654990212683142e-07,
"loss": -0.0131,
"reward": 0.5161425247788429,
"reward_std": 0.2771513797342777,
"rewards/improved_len_reward_dast": 0.5161425247788429,
"step": 157
},
{
"completion_length": 1728.2755126953125,
"epoch": 0.40959170447180815,
"grad_norm": 0.16088007390499115,
"kl": 0.01190185546875,
"learning_rate": 7.619032830458307e-07,
"loss": 0.0392,
"reward": 0.6053505837917328,
"reward_std": 0.22085025534033775,
"rewards/improved_len_reward_dast": 0.6053505837917328,
"step": 158
},
{
"completion_length": 2031.5408325195312,
"epoch": 0.4121840570317563,
"grad_norm": 0.1635725349187851,
"kl": 0.01397705078125,
"learning_rate": 7.582900752519723e-07,
"loss": -0.0071,
"reward": 0.5291006043553352,
"reward_std": 0.253378227353096,
"rewards/improved_len_reward_dast": 0.5291006043553352,
"step": 159
},
{
"completion_length": 2087.8060607910156,
"epoch": 0.4147764095917045,
"grad_norm": 0.19703420996665955,
"kl": 0.013885498046875,
"learning_rate": 7.546596957641031e-07,
"loss": 0.0142,
"reward": 0.4236784651875496,
"reward_std": 0.264580138027668,
"rewards/improved_len_reward_dast": 0.4236784651875496,
"step": 160
},
{
"completion_length": 1966.3953247070312,
"epoch": 0.41736876215165264,
"grad_norm": 0.17154966294765472,
"kl": 0.016204833984375,
"learning_rate": 7.510124438752432e-07,
"loss": 0.0021,
"reward": 0.5379416197538376,
"reward_std": 0.2562957741320133,
"rewards/improved_len_reward_dast": 0.5379416197538376,
"step": 161
},
{
"completion_length": 1525.1632080078125,
"epoch": 0.4199611147116008,
"grad_norm": 0.14776575565338135,
"kl": 0.01141357421875,
"learning_rate": 7.473486202693949e-07,
"loss": 0.0315,
"reward": 0.69241763651371,
"reward_std": 0.22519692406058311,
"rewards/improved_len_reward_dast": 0.69241763651371,
"step": 162
},
{
"completion_length": 1797.1249694824219,
"epoch": 0.4225534672715489,
"grad_norm": 0.17613932490348816,
"kl": 0.0154266357421875,
"learning_rate": 7.43668526996753e-07,
"loss": 0.0094,
"reward": 0.48756927251815796,
"reward_std": 0.300619401037693,
"rewards/improved_len_reward_dast": 0.48756927251815796,
"step": 163
},
{
"completion_length": 1866.8596801757812,
"epoch": 0.4251458198314971,
"grad_norm": 0.1625932902097702,
"kl": 0.015716552734375,
"learning_rate": 7.399724674488046e-07,
"loss": 0.0021,
"reward": 0.46739767491817474,
"reward_std": 0.24503038078546524,
"rewards/improved_len_reward_dast": 0.46739767491817474,
"step": 164
},
{
"completion_length": 1815.2423095703125,
"epoch": 0.42773817239144524,
"grad_norm": 0.23522265255451202,
"kl": 0.01446533203125,
"learning_rate": 7.36260746333316e-07,
"loss": 0.049,
"reward": 0.5157830119132996,
"reward_std": 0.1916877217590809,
"rewards/improved_len_reward_dast": 0.5157830119132996,
"step": 165
},
{
"completion_length": 1679.2806091308594,
"epoch": 0.4303305249513934,
"grad_norm": 0.1362147033214569,
"kl": 0.0125274658203125,
"learning_rate": 7.325336696492128e-07,
"loss": 0.0273,
"reward": 0.5556403249502182,
"reward_std": 0.22342020645737648,
"rewards/improved_len_reward_dast": 0.5556403249502182,
"step": 166
},
{
"completion_length": 1882.0254516601562,
"epoch": 0.43292287751134156,
"grad_norm": 0.17028290033340454,
"kl": 0.0140838623046875,
"learning_rate": 7.287915446613531e-07,
"loss": 0.0281,
"reward": 0.48191484808921814,
"reward_std": 0.2616124339401722,
"rewards/improved_len_reward_dast": 0.48191484808921814,
"step": 167
},
{
"completion_length": 2080.8468627929688,
"epoch": 0.43551523007128967,
"grad_norm": 0.16656753420829773,
"kl": 0.019134521484375,
"learning_rate": 7.250346798751953e-07,
"loss": -0.0133,
"reward": 0.4320894777774811,
"reward_std": 0.30758891999721527,
"rewards/improved_len_reward_dast": 0.4320894777774811,
"step": 168
},
{
"completion_length": 1616.6836700439453,
"epoch": 0.43810758263123784,
"grad_norm": 0.17435774207115173,
"kl": 0.0129547119140625,
"learning_rate": 7.212633850113662e-07,
"loss": -0.0002,
"reward": 0.43373018503189087,
"reward_std": 0.26510076597332954,
"rewards/improved_len_reward_dast": 0.43373018503189087,
"step": 169
},
{
"completion_length": 1309.869857788086,
"epoch": 0.440699935191186,
"grad_norm": 0.15860451757907867,
"kl": 0.011932373046875,
"learning_rate": 7.174779709801253e-07,
"loss": -0.0072,
"reward": 0.4780568554997444,
"reward_std": 0.2705870047211647,
"rewards/improved_len_reward_dast": 0.4780568554997444,
"step": 170
},
{
"completion_length": 1558.0025329589844,
"epoch": 0.44329228775113416,
"grad_norm": 0.12908661365509033,
"kl": 0.01216888427734375,
"learning_rate": 7.136787498557344e-07,
"loss": -0.0114,
"reward": 0.47240160405635834,
"reward_std": 0.30547885224223137,
"rewards/improved_len_reward_dast": 0.47240160405635834,
"step": 171
},
{
"completion_length": 1665.77294921875,
"epoch": 0.4458846403110823,
"grad_norm": 0.1525769829750061,
"kl": 0.0130615234375,
"learning_rate": 7.098660348507293e-07,
"loss": -0.0124,
"reward": 0.5375373288989067,
"reward_std": 0.25483621656894684,
"rewards/improved_len_reward_dast": 0.5375373288989067,
"step": 172
},
{
"completion_length": 1853.0943603515625,
"epoch": 0.44847699287103043,
"grad_norm": 0.13048604130744934,
"kl": 0.0130157470703125,
"learning_rate": 7.060401402900977e-07,
"loss": -0.0133,
"reward": 0.45648840069770813,
"reward_std": 0.2915825918316841,
"rewards/improved_len_reward_dast": 0.45648840069770813,
"step": 173
},
{
"completion_length": 1776.0254516601562,
"epoch": 0.4510693454309786,
"grad_norm": 0.14781691133975983,
"kl": 0.0131683349609375,
"learning_rate": 7.022013815853672e-07,
"loss": -0.0126,
"reward": 0.4387430027127266,
"reward_std": 0.301468089222908,
"rewards/improved_len_reward_dast": 0.4387430027127266,
"step": 174
},
{
"completion_length": 1614.7882385253906,
"epoch": 0.45366169799092676,
"grad_norm": 0.1574937105178833,
"kl": 0.0132598876953125,
"learning_rate": 6.983500752086006e-07,
"loss": 0.0277,
"reward": 0.5207101553678513,
"reward_std": 0.26092710718512535,
"rewards/improved_len_reward_dast": 0.5207101553678513,
"step": 175
},
{
"completion_length": 1572.3290405273438,
"epoch": 0.4562540505508749,
"grad_norm": 0.13418346643447876,
"kl": 0.0114593505859375,
"learning_rate": 6.94486538666307e-07,
"loss": 0.017,
"reward": 0.5103653743863106,
"reward_std": 0.24962808936834335,
"rewards/improved_len_reward_dast": 0.5103653743863106,
"step": 176
},
{
"completion_length": 1609.9362182617188,
"epoch": 0.4588464031108231,
"grad_norm": 0.15368600189685822,
"kl": 0.0113983154296875,
"learning_rate": 6.906110904732656e-07,
"loss": 0.0098,
"reward": 0.571323998272419,
"reward_std": 0.2758530154824257,
"rewards/improved_len_reward_dast": 0.571323998272419,
"step": 177
},
{
"completion_length": 2069.033172607422,
"epoch": 0.46143875567077125,
"grad_norm": 0.1310436725616455,
"kl": 0.012939453125,
"learning_rate": 6.867240501262666e-07,
"loss": 0.0214,
"reward": 0.537315845489502,
"reward_std": 0.21321317180991173,
"rewards/improved_len_reward_dast": 0.537315845489502,
"step": 178
},
{
"completion_length": 1546.3826293945312,
"epoch": 0.46403110823071936,
"grad_norm": 0.18392066657543182,
"kl": 0.011871337890625,
"learning_rate": 6.828257380777723e-07,
"loss": -0.0405,
"reward": 0.337811965495348,
"reward_std": 0.30249205976724625,
"rewards/improved_len_reward_dast": 0.337811965495348,
"step": 179
},
{
"completion_length": 1793.1861572265625,
"epoch": 0.4666234607906675,
"grad_norm": 0.13693319261074066,
"kl": 0.0158233642578125,
"learning_rate": 6.789164757094978e-07,
"loss": 0.0131,
"reward": 0.4841275066137314,
"reward_std": 0.28166862949728966,
"rewards/improved_len_reward_dast": 0.4841275066137314,
"step": 180
},
{
"completion_length": 1669.7703552246094,
"epoch": 0.4692158133506157,
"grad_norm": 0.26192930340766907,
"kl": 0.012939453125,
"learning_rate": 6.749965853059164e-07,
"loss": 0.0681,
"reward": 0.5609092861413956,
"reward_std": 0.2805982828140259,
"rewards/improved_len_reward_dast": 0.5609092861413956,
"step": 181
},
{
"completion_length": 1579.7372131347656,
"epoch": 0.47180816591056385,
"grad_norm": 0.13756124675273895,
"kl": 0.01165008544921875,
"learning_rate": 6.710663900276903e-07,
"loss": -0.0036,
"reward": 0.4818296991288662,
"reward_std": 0.23825621232390404,
"rewards/improved_len_reward_dast": 0.4818296991288662,
"step": 182
},
{
"completion_length": 1472.9540252685547,
"epoch": 0.474400518470512,
"grad_norm": 0.1440887451171875,
"kl": 0.01090240478515625,
"learning_rate": 6.671262138850274e-07,
"loss": 0.0178,
"reward": 0.6261176690459251,
"reward_std": 0.2099764347076416,
"rewards/improved_len_reward_dast": 0.6261176690459251,
"step": 183
},
{
"completion_length": 1391.3188171386719,
"epoch": 0.4769928710304601,
"grad_norm": 0.14985869824886322,
"kl": 0.010223388671875,
"learning_rate": 6.631763817109717e-07,
"loss": 0.036,
"reward": 0.6541325002908707,
"reward_std": 0.20582681149244308,
"rewards/improved_len_reward_dast": 0.6541325002908707,
"step": 184
},
{
"completion_length": 1469.6122436523438,
"epoch": 0.4795852235904083,
"grad_norm": 0.16654829680919647,
"kl": 0.0106353759765625,
"learning_rate": 6.592172191346218e-07,
"loss": 0.0004,
"reward": 0.5705170333385468,
"reward_std": 0.25396620109677315,
"rewards/improved_len_reward_dast": 0.5705170333385468,
"step": 185
},
{
"completion_length": 1663.2117004394531,
"epoch": 0.48217757615035645,
"grad_norm": 0.1463920623064041,
"kl": 0.0134735107421875,
"learning_rate": 6.552490525542864e-07,
"loss": 0.0037,
"reward": 0.5099420920014381,
"reward_std": 0.25713133439421654,
"rewards/improved_len_reward_dast": 0.5099420920014381,
"step": 186
},
{
"completion_length": 1614.6351318359375,
"epoch": 0.4847699287103046,
"grad_norm": 0.13490332663059235,
"kl": 0.012054443359375,
"learning_rate": 6.512722091105757e-07,
"loss": -0.0025,
"reward": 0.5862669795751572,
"reward_std": 0.257910817861557,
"rewards/improved_len_reward_dast": 0.5862669795751572,
"step": 187
},
{
"completion_length": 1392.2372131347656,
"epoch": 0.4873622812702528,
"grad_norm": 0.14285001158714294,
"kl": 0.0123291015625,
"learning_rate": 6.472870166594314e-07,
"loss": 0.0127,
"reward": 0.6144573241472244,
"reward_std": 0.2229880653321743,
"rewards/improved_len_reward_dast": 0.6144573241472244,
"step": 188
},
{
"completion_length": 1885.2652587890625,
"epoch": 0.4899546338302009,
"grad_norm": 0.14947953820228577,
"kl": 0.0140228271484375,
"learning_rate": 6.432938037450974e-07,
"loss": 0.0111,
"reward": 0.5071591883897781,
"reward_std": 0.23271573707461357,
"rewards/improved_len_reward_dast": 0.5071591883897781,
"step": 189
},
{
"completion_length": 1780.61474609375,
"epoch": 0.49254698639014904,
"grad_norm": 0.1921232044696808,
"kl": 0.0138397216796875,
"learning_rate": 6.392928995730352e-07,
"loss": 0.0336,
"reward": 0.5300878472626209,
"reward_std": 0.2751046009361744,
"rewards/improved_len_reward_dast": 0.5300878472626209,
"step": 190
},
{
"completion_length": 1370.4795837402344,
"epoch": 0.4951393389500972,
"grad_norm": 0.2351725697517395,
"kl": 0.01165008544921875,
"learning_rate": 6.352846339827826e-07,
"loss": 0.0776,
"reward": 0.5745302811264992,
"reward_std": 0.25346530973911285,
"rewards/improved_len_reward_dast": 0.5745302811264992,
"step": 191
},
{
"completion_length": 1441.2831420898438,
"epoch": 0.49773169151004537,
"grad_norm": 0.16980963945388794,
"kl": 0.013824462890625,
"learning_rate": 6.312693374207627e-07,
"loss": 0.0208,
"reward": 0.548950806260109,
"reward_std": 0.2456044964492321,
"rewards/improved_len_reward_dast": 0.548950806260109,
"step": 192
},
{
"completion_length": 1563.9183349609375,
"epoch": 0.5003240440699935,
"grad_norm": 0.16622929275035858,
"kl": 0.0142364501953125,
"learning_rate": 6.272473409130397e-07,
"loss": 0.0204,
"reward": 0.550769068300724,
"reward_std": 0.2540467455983162,
"rewards/improved_len_reward_dast": 0.550769068300724,
"step": 193
},
{
"completion_length": 1447.869857788086,
"epoch": 0.5029163966299417,
"grad_norm": 0.15316490828990936,
"kl": 0.010894775390625,
"learning_rate": 6.232189760380301e-07,
"loss": 0.0154,
"reward": 0.5217381715774536,
"reward_std": 0.28643129020929337,
"rewards/improved_len_reward_dast": 0.5217381715774536,
"step": 194
},
{
"completion_length": 1797.6096496582031,
"epoch": 0.5055087491898899,
"grad_norm": 0.17682208120822906,
"kl": 0.0141143798828125,
"learning_rate": 6.191845748991671e-07,
"loss": -0.0155,
"reward": 0.48948052898049355,
"reward_std": 0.21832110546529293,
"rewards/improved_len_reward_dast": 0.48948052898049355,
"step": 195
},
{
"completion_length": 1715.7448425292969,
"epoch": 0.508101101749838,
"grad_norm": 0.1485513597726822,
"kl": 0.0159454345703125,
"learning_rate": 6.151444700975203e-07,
"loss": 0.0098,
"reward": 0.599296048283577,
"reward_std": 0.2545859329402447,
"rewards/improved_len_reward_dast": 0.599296048283577,
"step": 196
},
{
"completion_length": 2129.3111877441406,
"epoch": 0.5106934543097861,
"grad_norm": 0.15262384712696075,
"kl": 0.017852783203125,
"learning_rate": 6.110989947043767e-07,
"loss": 0.0272,
"reward": 0.41421886533498764,
"reward_std": 0.3264440894126892,
"rewards/improved_len_reward_dast": 0.41421886533498764,
"step": 197
},
{
"completion_length": 1410.6402740478516,
"epoch": 0.5132858068697342,
"grad_norm": 0.15426945686340332,
"kl": 0.015106201171875,
"learning_rate": 6.070484822337816e-07,
"loss": 0.0002,
"reward": 0.5891918540000916,
"reward_std": 0.23263467848300934,
"rewards/improved_len_reward_dast": 0.5891918540000916,
"step": 198
},
{
"completion_length": 1698.915771484375,
"epoch": 0.5158781594296824,
"grad_norm": 0.2302210032939911,
"kl": 0.016326904296875,
"learning_rate": 6.029932666150431e-07,
"loss": 0.0565,
"reward": 0.5624502748250961,
"reward_std": 0.23056710511446,
"rewards/improved_len_reward_dast": 0.5624502748250961,
"step": 199
},
{
"completion_length": 1663.0356750488281,
"epoch": 0.5184705119896306,
"grad_norm": 0.15166351199150085,
"kl": 0.016143798828125,
"learning_rate": 5.989336821652029e-07,
"loss": 0.0351,
"reward": 0.5748374983668327,
"reward_std": 0.231033306568861,
"rewards/improved_len_reward_dast": 0.5748374983668327,
"step": 200
},
{
"completion_length": 1331.0382385253906,
"epoch": 0.5210628645495787,
"grad_norm": 0.21695953607559204,
"kl": 0.0140228271484375,
"learning_rate": 5.948700635614745e-07,
"loss": -0.0329,
"reward": 0.35667416942305863,
"reward_std": 0.29070717096328735,
"rewards/improved_len_reward_dast": 0.35667416942305863,
"step": 201
},
{
"completion_length": 1358.9055786132812,
"epoch": 0.5236552171095269,
"grad_norm": 0.18666747212409973,
"kl": 0.0137786865234375,
"learning_rate": 5.908027458136518e-07,
"loss": 0.0408,
"reward": 0.6661410629749298,
"reward_std": 0.20661123096942902,
"rewards/improved_len_reward_dast": 0.6661410629749298,
"step": 202
},
{
"completion_length": 1476.1377563476562,
"epoch": 0.5262475696694751,
"grad_norm": 0.13691206276416779,
"kl": 0.0115814208984375,
"learning_rate": 5.867320642364916e-07,
"loss": -0.0011,
"reward": 0.6029430329799652,
"reward_std": 0.2665823772549629,
"rewards/improved_len_reward_dast": 0.6029430329799652,
"step": 203
},
{
"completion_length": 1761.4412841796875,
"epoch": 0.5288399222294232,
"grad_norm": 0.16101093590259552,
"kl": 0.0147247314453125,
"learning_rate": 5.826583544220678e-07,
"loss": -0.003,
"reward": 0.4686589315533638,
"reward_std": 0.30455850437283516,
"rewards/improved_len_reward_dast": 0.4686589315533638,
"step": 204
},
{
"completion_length": 1345.9336395263672,
"epoch": 0.5314322747893714,
"grad_norm": 0.18765667080879211,
"kl": 0.0143280029296875,
"learning_rate": 5.78581952212107e-07,
"loss": 0.0427,
"reward": 0.5415176302194595,
"reward_std": 0.1892006602138281,
"rewards/improved_len_reward_dast": 0.5415176302194595,
"step": 205
},
{
"completion_length": 1723.2882385253906,
"epoch": 0.5340246273493195,
"grad_norm": 0.16698125004768372,
"kl": 0.015594482421875,
"learning_rate": 5.745031936702997e-07,
"loss": 0.0169,
"reward": 0.5265255123376846,
"reward_std": 0.21307621523737907,
"rewards/improved_len_reward_dast": 0.5265255123376846,
"step": 206
},
{
"completion_length": 1802.2014770507812,
"epoch": 0.5366169799092677,
"grad_norm": 0.1614968627691269,
"kl": 0.019561767578125,
"learning_rate": 5.704224150545956e-07,
"loss": 0.0221,
"reward": 0.4998108521103859,
"reward_std": 0.2493179477751255,
"rewards/improved_len_reward_dast": 0.4998108521103859,
"step": 207
},
{
"completion_length": 1295.1887664794922,
"epoch": 0.5392093324692158,
"grad_norm": 0.17712561786174774,
"kl": 0.01239013671875,
"learning_rate": 5.663399527894816e-07,
"loss": 0.0241,
"reward": 0.6530888006091118,
"reward_std": 0.20925156585872173,
"rewards/improved_len_reward_dast": 0.6530888006091118,
"step": 208
},
{
"completion_length": 1494.2805786132812,
"epoch": 0.5418016850291639,
"grad_norm": 0.13745726644992828,
"kl": 0.013885498046875,
"learning_rate": 5.622561434382467e-07,
"loss": 0.0127,
"reward": 0.48835258930921555,
"reward_std": 0.29268738254904747,
"rewards/improved_len_reward_dast": 0.48835258930921555,
"step": 209
},
{
"completion_length": 2105.05859375,
"epoch": 0.5443940375891121,
"grad_norm": 0.16275277733802795,
"kl": 0.019744873046875,
"learning_rate": 5.581713236752361e-07,
"loss": -0.0125,
"reward": 0.49209489673376083,
"reward_std": 0.22952783107757568,
"rewards/improved_len_reward_dast": 0.49209489673376083,
"step": 210
},
{
"completion_length": 1500.5382385253906,
"epoch": 0.5469863901490603,
"grad_norm": 0.1682814359664917,
"kl": 0.015899658203125,
"learning_rate": 5.540858302580934e-07,
"loss": 0.0207,
"reward": 0.5571364462375641,
"reward_std": 0.24001171812415123,
"rewards/improved_len_reward_dast": 0.5571364462375641,
"step": 211
},
{
"completion_length": 1923.9718933105469,
"epoch": 0.5495787427090084,
"grad_norm": 0.26020315289497375,
"kl": 0.0178680419921875,
"learning_rate": 5.5e-07,
"loss": -0.0687,
"reward": 0.46721208840608597,
"reward_std": 0.2817242816090584,
"rewards/improved_len_reward_dast": 0.46721208840608597,
"step": 212
},
{
"completion_length": 1559.1249694824219,
"epoch": 0.5521710952689566,
"grad_norm": 0.19159933924674988,
"kl": 0.0157012939453125,
"learning_rate": 5.459141697419066e-07,
"loss": -0.0163,
"reward": 0.6227085031569004,
"reward_std": 0.17799550667405128,
"rewards/improved_len_reward_dast": 0.6227085031569004,
"step": 213
},
{
"completion_length": 1373.073959350586,
"epoch": 0.5547634478289047,
"grad_norm": 0.13028506934642792,
"kl": 0.0119476318359375,
"learning_rate": 5.418286763247641e-07,
"loss": 0.0155,
"reward": 0.6422765105962753,
"reward_std": 0.20027055218815804,
"rewards/improved_len_reward_dast": 0.6422765105962753,
"step": 214
},
{
"completion_length": 1835.0841369628906,
"epoch": 0.5573558003888529,
"grad_norm": 0.19164609909057617,
"kl": 0.02093505859375,
"learning_rate": 5.377438565617532e-07,
"loss": -0.0,
"reward": 0.45971810445189476,
"reward_std": 0.3102139085531235,
"rewards/improved_len_reward_dast": 0.45971810445189476,
"step": 215
},
{
"completion_length": 2203.964324951172,
"epoch": 0.5599481529488011,
"grad_norm": 0.20495197176933289,
"kl": 0.023651123046875,
"learning_rate": 5.336600472105186e-07,
"loss": 0.0257,
"reward": 0.46121083945035934,
"reward_std": 0.2739550843834877,
"rewards/improved_len_reward_dast": 0.46121083945035934,
"step": 216
},
{
"completion_length": 1694.767822265625,
"epoch": 0.5625405055087492,
"grad_norm": 0.17180804908275604,
"kl": 0.0158233642578125,
"learning_rate": 5.295775849454045e-07,
"loss": 0.0214,
"reward": 0.4882684126496315,
"reward_std": 0.18393072485923767,
"rewards/improved_len_reward_dast": 0.4882684126496315,
"step": 217
},
{
"completion_length": 2042.6683044433594,
"epoch": 0.5651328580686974,
"grad_norm": 0.14189192652702332,
"kl": 0.0199127197265625,
"learning_rate": 5.254968063297003e-07,
"loss": 0.0033,
"reward": 0.4571044594049454,
"reward_std": 0.2295171208679676,
"rewards/improved_len_reward_dast": 0.4571044594049454,
"step": 218
},
{
"completion_length": 1910.2091064453125,
"epoch": 0.5677252106286454,
"grad_norm": 0.1559721827507019,
"kl": 0.0175628662109375,
"learning_rate": 5.214180477878931e-07,
"loss": 0.0287,
"reward": 0.5590897053480148,
"reward_std": 0.24075813218951225,
"rewards/improved_len_reward_dast": 0.5590897053480148,
"step": 219
},
{
"completion_length": 2258.568878173828,
"epoch": 0.5703175631885936,
"grad_norm": 0.14663958549499512,
"kl": 0.0192718505859375,
"learning_rate": 5.173416455779323e-07,
"loss": 0.0057,
"reward": 0.5018766671419144,
"reward_std": 0.26441601663827896,
"rewards/improved_len_reward_dast": 0.5018766671419144,
"step": 220
},
{
"completion_length": 1572.7805786132812,
"epoch": 0.5729099157485418,
"grad_norm": 0.18465618789196014,
"kl": 0.0140838623046875,
"learning_rate": 5.132679357635086e-07,
"loss": 0.0051,
"reward": 0.503139078617096,
"reward_std": 0.2186896838247776,
"rewards/improved_len_reward_dast": 0.503139078617096,
"step": 221
},
{
"completion_length": 1802.676025390625,
"epoch": 0.5755022683084899,
"grad_norm": 0.14844530820846558,
"kl": 0.0166778564453125,
"learning_rate": 5.091972541863481e-07,
"loss": 0.0324,
"reward": 0.5386637449264526,
"reward_std": 0.23858756944537163,
"rewards/improved_len_reward_dast": 0.5386637449264526,
"step": 222
},
{
"completion_length": 1240.0254821777344,
"epoch": 0.5780946208684381,
"grad_norm": 0.21698173880577087,
"kl": 0.01102447509765625,
"learning_rate": 5.051299364385257e-07,
"loss": 0.0494,
"reward": 0.6127595007419586,
"reward_std": 0.22652167454361916,
"rewards/improved_len_reward_dast": 0.6127595007419586,
"step": 223
},
{
"completion_length": 2182.4820861816406,
"epoch": 0.5806869734283863,
"grad_norm": 0.17258256673812866,
"kl": 0.022186279296875,
"learning_rate": 5.010663178347971e-07,
"loss": 0.0347,
"reward": 0.538253664970398,
"reward_std": 0.2642097547650337,
"rewards/improved_len_reward_dast": 0.538253664970398,
"step": 224
},
{
"completion_length": 1501.5688171386719,
"epoch": 0.5832793259883344,
"grad_norm": 0.1956610530614853,
"kl": 0.016021728515625,
"learning_rate": 4.970067333849568e-07,
"loss": 0.0345,
"reward": 0.49669354408979416,
"reward_std": 0.27108582854270935,
"rewards/improved_len_reward_dast": 0.49669354408979416,
"step": 225
},
{
"completion_length": 1730.1020202636719,
"epoch": 0.5858716785482826,
"grad_norm": 0.207245334982872,
"kl": 0.019317626953125,
"learning_rate": 4.929515177662182e-07,
"loss": 0.0365,
"reward": 0.5905517414212227,
"reward_std": 0.19489995390176773,
"rewards/improved_len_reward_dast": 0.5905517414212227,
"step": 226
},
{
"completion_length": 1841.9719543457031,
"epoch": 0.5884640311082308,
"grad_norm": 0.15803495049476624,
"kl": 0.0171661376953125,
"learning_rate": 4.889010052956233e-07,
"loss": 0.0201,
"reward": 0.5235611572861671,
"reward_std": 0.28989996388554573,
"rewards/improved_len_reward_dast": 0.5235611572861671,
"step": 227
},
{
"completion_length": 1340.9795837402344,
"epoch": 0.5910563836681789,
"grad_norm": 0.14726755023002625,
"kl": 0.0139312744140625,
"learning_rate": 4.848555299024798e-07,
"loss": 0.0361,
"reward": 0.6701846867799759,
"reward_std": 0.1808851771056652,
"rewards/improved_len_reward_dast": 0.6701846867799759,
"step": 228
},
{
"completion_length": 1694.4412841796875,
"epoch": 0.593648736228127,
"grad_norm": 0.1517164260149002,
"kl": 0.0196075439453125,
"learning_rate": 4.80815425100833e-07,
"loss": 0.0279,
"reward": 0.5193780064582825,
"reward_std": 0.2600921764969826,
"rewards/improved_len_reward_dast": 0.5193780064582825,
"step": 229
},
{
"completion_length": 1846.6199035644531,
"epoch": 0.5962410887880751,
"grad_norm": 0.14960968494415283,
"kl": 0.021148681640625,
"learning_rate": 4.7678102396196983e-07,
"loss": 0.0065,
"reward": 0.5484226644039154,
"reward_std": 0.2323933281004429,
"rewards/improved_len_reward_dast": 0.5484226644039154,
"step": 230
},
{
"completion_length": 1674.3673400878906,
"epoch": 0.5988334413480233,
"grad_norm": 0.17465737462043762,
"kl": 0.0171966552734375,
"learning_rate": 4.727526590869605e-07,
"loss": 0.0101,
"reward": 0.4662090986967087,
"reward_std": 0.21839703619480133,
"rewards/improved_len_reward_dast": 0.4662090986967087,
"step": 231
},
{
"completion_length": 1509.1658020019531,
"epoch": 0.6014257939079715,
"grad_norm": 0.2101481705904007,
"kl": 0.0194244384765625,
"learning_rate": 4.6873066257923735e-07,
"loss": 0.0111,
"reward": 0.41205941140651703,
"reward_std": 0.21029997244477272,
"rewards/improved_len_reward_dast": 0.41205941140651703,
"step": 232
},
{
"completion_length": 2161.2933044433594,
"epoch": 0.6040181464679196,
"grad_norm": 0.15703819692134857,
"kl": 0.02520751953125,
"learning_rate": 4.647153660172173e-07,
"loss": 0.0004,
"reward": 0.48261498659849167,
"reward_std": 0.23572781309485435,
"rewards/improved_len_reward_dast": 0.48261498659849167,
"step": 233
},
{
"completion_length": 2022.1300354003906,
"epoch": 0.6066104990278678,
"grad_norm": 0.26027029752731323,
"kl": 0.027191162109375,
"learning_rate": 4.607071004269647e-07,
"loss": 0.0521,
"reward": 0.5801032036542892,
"reward_std": 0.2363697662949562,
"rewards/improved_len_reward_dast": 0.5801032036542892,
"step": 234
},
{
"completion_length": 1732.1275329589844,
"epoch": 0.609202851587816,
"grad_norm": 0.16403397917747498,
"kl": 0.023712158203125,
"learning_rate": 4.567061962549025e-07,
"loss": 0.0162,
"reward": 0.5604917779564857,
"reward_std": 0.22574709728360176,
"rewards/improved_len_reward_dast": 0.5604917779564857,
"step": 235
},
{
"completion_length": 1861.7398071289062,
"epoch": 0.6117952041477641,
"grad_norm": 0.17224401235580444,
"kl": 0.02764892578125,
"learning_rate": 4.527129833405687e-07,
"loss": -0.0105,
"reward": 0.5203966200351715,
"reward_std": 0.24152075126767159,
"rewards/improved_len_reward_dast": 0.5203966200351715,
"step": 236
},
{
"completion_length": 1547.3724365234375,
"epoch": 0.6143875567077123,
"grad_norm": 0.23777875304222107,
"kl": 0.0208740234375,
"learning_rate": 4.4872779088942425e-07,
"loss": 0.0271,
"reward": 0.5496758297085762,
"reward_std": 0.27215462550520897,
"rewards/improved_len_reward_dast": 0.5496758297085762,
"step": 237
},
{
"completion_length": 1969.2167663574219,
"epoch": 0.6169799092676604,
"grad_norm": 0.17101754248142242,
"kl": 0.028717041015625,
"learning_rate": 4.447509474457135e-07,
"loss": 0.0279,
"reward": 0.614069253206253,
"reward_std": 0.22787783294916153,
"rewards/improved_len_reward_dast": 0.614069253206253,
"step": 238
},
{
"completion_length": 1983.8570861816406,
"epoch": 0.6195722618276086,
"grad_norm": 0.14745572209358215,
"kl": 0.027984619140625,
"learning_rate": 4.4078278086537823e-07,
"loss": 0.0075,
"reward": 0.5620970204472542,
"reward_std": 0.28037280961871147,
"rewards/improved_len_reward_dast": 0.5620970204472542,
"step": 239
},
{
"completion_length": 1501.4897766113281,
"epoch": 0.6221646143875567,
"grad_norm": 0.15315327048301697,
"kl": 0.0208740234375,
"learning_rate": 4.3682361828902846e-07,
"loss": 0.0168,
"reward": 0.5364163219928741,
"reward_std": 0.2769155353307724,
"rewards/improved_len_reward_dast": 0.5364163219928741,
"step": 240
},
{
"completion_length": 1720.4004516601562,
"epoch": 0.6247569669475048,
"grad_norm": 0.2516409158706665,
"kl": 0.025726318359375,
"learning_rate": 4.328737861149726e-07,
"loss": -0.0237,
"reward": 0.4434478208422661,
"reward_std": 0.27931295707821846,
"rewards/improved_len_reward_dast": 0.4434478208422661,
"step": 241
},
{
"completion_length": 1809.1555633544922,
"epoch": 0.627349319507453,
"grad_norm": 0.16826176643371582,
"kl": 0.0258331298828125,
"learning_rate": 4.289336099723098e-07,
"loss": 0.0143,
"reward": 0.5690242052078247,
"reward_std": 0.23354141414165497,
"rewards/improved_len_reward_dast": 0.5690242052078247,
"step": 242
},
{
"completion_length": 1983.2984313964844,
"epoch": 0.6299416720674011,
"grad_norm": 0.19760891795158386,
"kl": 0.0326995849609375,
"learning_rate": 4.250034146940834e-07,
"loss": 0.0445,
"reward": 0.5676752850413322,
"reward_std": 0.24079378694295883,
"rewards/improved_len_reward_dast": 0.5676752850413322,
"step": 243
},
{
"completion_length": 1628.5739440917969,
"epoch": 0.6325340246273493,
"grad_norm": 0.16681502759456635,
"kl": 0.0252532958984375,
"learning_rate": 4.210835242905023e-07,
"loss": 0.023,
"reward": 0.5827814638614655,
"reward_std": 0.23568623140454292,
"rewards/improved_len_reward_dast": 0.5827814638614655,
"step": 244
},
{
"completion_length": 1828.3596496582031,
"epoch": 0.6351263771872975,
"grad_norm": 0.18224835395812988,
"kl": 0.032196044921875,
"learning_rate": 4.1717426192222784e-07,
"loss": 0.0288,
"reward": 0.5939928889274597,
"reward_std": 0.19402909092605114,
"rewards/improved_len_reward_dast": 0.5939928889274597,
"step": 245
},
{
"completion_length": 2015.790771484375,
"epoch": 0.6377187297472456,
"grad_norm": 0.17992916703224182,
"kl": 0.03424072265625,
"learning_rate": 4.1327594987373347e-07,
"loss": -0.0046,
"reward": 0.4280674587935209,
"reward_std": 0.23897960036993027,
"rewards/improved_len_reward_dast": 0.4280674587935209,
"step": 246
},
{
"completion_length": 2058.4132385253906,
"epoch": 0.6403110823071938,
"grad_norm": 0.16108137369155884,
"kl": 0.032012939453125,
"learning_rate": 4.0938890952673443e-07,
"loss": 0.0148,
"reward": 0.4607112519443035,
"reward_std": 0.19015729054808617,
"rewards/improved_len_reward_dast": 0.4607112519443035,
"step": 247
},
{
"completion_length": 1944.2117004394531,
"epoch": 0.642903434867142,
"grad_norm": 0.17802801728248596,
"kl": 0.03271484375,
"learning_rate": 4.05513461333693e-07,
"loss": -0.017,
"reward": 0.47688183188438416,
"reward_std": 0.2750718258321285,
"rewards/improved_len_reward_dast": 0.47688183188438416,
"step": 248
},
{
"completion_length": 1828.591796875,
"epoch": 0.6454957874270901,
"grad_norm": 0.2043733447790146,
"kl": 0.0340576171875,
"learning_rate": 4.016499247913994e-07,
"loss": 0.0015,
"reward": 0.5244659259915352,
"reward_std": 0.22990256920456886,
"rewards/improved_len_reward_dast": 0.5244659259915352,
"step": 249
},
{
"completion_length": 1883.8673095703125,
"epoch": 0.6480881399870383,
"grad_norm": 0.1844264417886734,
"kl": 0.032318115234375,
"learning_rate": 3.977986184146328e-07,
"loss": 0.0399,
"reward": 0.5821568816900253,
"reward_std": 0.23736536875367165,
"rewards/improved_len_reward_dast": 0.5821568816900253,
"step": 250
},
{
"completion_length": 2136.216796875,
"epoch": 0.6506804925469863,
"grad_norm": 0.23143452405929565,
"kl": 0.03753662109375,
"learning_rate": 3.939598597099022e-07,
"loss": -0.0036,
"reward": 0.4462605491280556,
"reward_std": 0.31835515797138214,
"rewards/improved_len_reward_dast": 0.4462605491280556,
"step": 251
},
{
"completion_length": 1681.5816040039062,
"epoch": 0.6532728451069345,
"grad_norm": 0.21353423595428467,
"kl": 0.030914306640625,
"learning_rate": 3.9013396514927076e-07,
"loss": -0.0203,
"reward": 0.47858475893735886,
"reward_std": 0.27276671305298805,
"rewards/improved_len_reward_dast": 0.47858475893735886,
"step": 252
},
{
"completion_length": 1958.6862182617188,
"epoch": 0.6558651976668827,
"grad_norm": 0.13848742842674255,
"kl": 0.02813720703125,
"learning_rate": 3.8632125014426566e-07,
"loss": -0.0093,
"reward": 0.4058891125023365,
"reward_std": 0.2632647715508938,
"rewards/improved_len_reward_dast": 0.4058891125023365,
"step": 253
},
{
"completion_length": 2378.114715576172,
"epoch": 0.6584575502268308,
"grad_norm": 0.2316746562719345,
"kl": 0.04150390625,
"learning_rate": 3.8252202901987474e-07,
"loss": 0.0253,
"reward": 0.4882591739296913,
"reward_std": 0.23233528062701225,
"rewards/improved_len_reward_dast": 0.4882591739296913,
"step": 254
},
{
"completion_length": 1957.6759643554688,
"epoch": 0.661049902786779,
"grad_norm": 0.25321757793426514,
"kl": 0.036376953125,
"learning_rate": 3.7873661498863384e-07,
"loss": -0.0147,
"reward": 0.5560312643647194,
"reward_std": 0.25412745028734207,
"rewards/improved_len_reward_dast": 0.5560312643647194,
"step": 255
},
{
"completion_length": 1821.5433349609375,
"epoch": 0.6636422553467272,
"grad_norm": 0.18754082918167114,
"kl": 0.033843994140625,
"learning_rate": 3.7496532012480463e-07,
"loss": 0.0354,
"reward": 0.5669542029500008,
"reward_std": 0.21794036030769348,
"rewards/improved_len_reward_dast": 0.5669542029500008,
"step": 256
},
{
"completion_length": 1623.8086547851562,
"epoch": 0.6662346079066753,
"grad_norm": 0.23035286366939545,
"kl": 0.032135009765625,
"learning_rate": 3.7120845533864706e-07,
"loss": 0.0102,
"reward": 0.5488623678684235,
"reward_std": 0.25401103869080544,
"rewards/improved_len_reward_dast": 0.5488623678684235,
"step": 257
},
{
"completion_length": 2070.8800659179688,
"epoch": 0.6688269604666235,
"grad_norm": 0.22188010811805725,
"kl": 0.039306640625,
"learning_rate": 3.6746633035078723e-07,
"loss": -0.0121,
"reward": 0.417112834751606,
"reward_std": 0.21221662312746048,
"rewards/improved_len_reward_dast": 0.417112834751606,
"step": 258
},
{
"completion_length": 1561.369873046875,
"epoch": 0.6714193130265717,
"grad_norm": 0.17679612338542938,
"kl": 0.029083251953125,
"learning_rate": 3.63739253666684e-07,
"loss": 0.0025,
"reward": 0.5591792389750481,
"reward_std": 0.23877732083201408,
"rewards/improved_len_reward_dast": 0.5591792389750481,
"step": 259
},
{
"completion_length": 2174.785675048828,
"epoch": 0.6740116655865198,
"grad_norm": 0.27692729234695435,
"kl": 0.037506103515625,
"learning_rate": 3.6002753255119533e-07,
"loss": 0.0435,
"reward": 0.5408740267157555,
"reward_std": 0.3040400817990303,
"rewards/improved_len_reward_dast": 0.5408740267157555,
"step": 260
},
{
"completion_length": 1679.0331115722656,
"epoch": 0.6766040181464679,
"grad_norm": 0.2655293047428131,
"kl": 0.0357513427734375,
"learning_rate": 3.5633147300324706e-07,
"loss": 0.0546,
"reward": 0.5232817307114601,
"reward_std": 0.19237679801881313,
"rewards/improved_len_reward_dast": 0.5232817307114601,
"step": 261
},
{
"completion_length": 2010.9719543457031,
"epoch": 0.679196370706416,
"grad_norm": 0.16808085143566132,
"kl": 0.038818359375,
"learning_rate": 3.526513797306051e-07,
"loss": 0.0302,
"reward": 0.5936213284730911,
"reward_std": 0.19600553810596466,
"rewards/improved_len_reward_dast": 0.5936213284730911,
"step": 262
},
{
"completion_length": 2130.2651977539062,
"epoch": 0.6817887232663642,
"grad_norm": 0.2609139382839203,
"kl": 0.05194091796875,
"learning_rate": 3.489875561247568e-07,
"loss": 0.0577,
"reward": 0.5483756810426712,
"reward_std": 0.19722291082143784,
"rewards/improved_len_reward_dast": 0.5483756810426712,
"step": 263
},
{
"completion_length": 1698.2474365234375,
"epoch": 0.6843810758263124,
"grad_norm": 0.20713907480239868,
"kl": 0.035247802734375,
"learning_rate": 3.453403042358968e-07,
"loss": 0.0219,
"reward": 0.591205969452858,
"reward_std": 0.2349410019814968,
"rewards/improved_len_reward_dast": 0.591205969452858,
"step": 264
},
{
"completion_length": 1852.9157409667969,
"epoch": 0.6869734283862605,
"grad_norm": 0.1937071830034256,
"kl": 0.0416259765625,
"learning_rate": 3.417099247480277e-07,
"loss": 0.0277,
"reward": 0.5075777247548103,
"reward_std": 0.2149694226682186,
"rewards/improved_len_reward_dast": 0.5075777247548103,
"step": 265
},
{
"completion_length": 2136.966766357422,
"epoch": 0.6895657809462087,
"grad_norm": 0.2680908441543579,
"kl": 0.0557861328125,
"learning_rate": 3.3809671695416916e-07,
"loss": 0.0296,
"reward": 0.5373198315501213,
"reward_std": 0.2887946330010891,
"rewards/improved_len_reward_dast": 0.5373198315501213,
"step": 266
},
{
"completion_length": 1520.6912841796875,
"epoch": 0.6921581335061568,
"grad_norm": 0.21836334466934204,
"kl": 0.03875732421875,
"learning_rate": 3.345009787316859e-07,
"loss": 0.0328,
"reward": 0.6441917270421982,
"reward_std": 0.20870398730039597,
"rewards/improved_len_reward_dast": 0.6441917270421982,
"step": 267
},
{
"completion_length": 1714.9182891845703,
"epoch": 0.694750486066105,
"grad_norm": 0.2412445843219757,
"kl": 0.045013427734375,
"learning_rate": 3.309230065177289e-07,
"loss": 0.0413,
"reward": 0.6430572420358658,
"reward_std": 0.1806990448385477,
"rewards/improved_len_reward_dast": 0.6430572420358658,
"step": 268
},
{
"completion_length": 1760.8800354003906,
"epoch": 0.6973428386260532,
"grad_norm": 0.22807085514068604,
"kl": 0.05609130859375,
"learning_rate": 3.273630952847971e-07,
"loss": 0.0423,
"reward": 0.5424732938408852,
"reward_std": 0.2143792137503624,
"rewards/improved_len_reward_dast": 0.5424732938408852,
"step": 269
},
{
"completion_length": 1853.6504821777344,
"epoch": 0.6999351911860013,
"grad_norm": 0.181836798787117,
"kl": 0.0499267578125,
"learning_rate": 3.2382153851641996e-07,
"loss": 0.0181,
"reward": 0.444052018225193,
"reward_std": 0.18882366083562374,
"rewards/improved_len_reward_dast": 0.444052018225193,
"step": 270
},
{
"completion_length": 1881.66064453125,
"epoch": 0.7025275437459495,
"grad_norm": 0.27354660630226135,
"kl": 0.068359375,
"learning_rate": 3.202986281829616e-07,
"loss": 0.0349,
"reward": 0.517607145011425,
"reward_std": 0.27927474305033684,
"rewards/improved_len_reward_dast": 0.517607145011425,
"step": 271
},
{
"completion_length": 1876.6555480957031,
"epoch": 0.7051198963058976,
"grad_norm": 0.28521379828453064,
"kl": 0.05859375,
"learning_rate": 3.1679465471755106e-07,
"loss": 0.0206,
"reward": 0.4820089340209961,
"reward_std": 0.23564638569951057,
"rewards/improved_len_reward_dast": 0.4820089340209961,
"step": 272
},
{
"completion_length": 2040.4591979980469,
"epoch": 0.7077122488658457,
"grad_norm": 0.23997171223163605,
"kl": 0.08282470703125,
"learning_rate": 3.1330990699213824e-07,
"loss": 0.0225,
"reward": 0.5280723571777344,
"reward_std": 0.19220414385199547,
"rewards/improved_len_reward_dast": 0.5280723571777344,
"step": 273
},
{
"completion_length": 1857.6989440917969,
"epoch": 0.7103046014257939,
"grad_norm": 0.43727678060531616,
"kl": 0.06512451171875,
"learning_rate": 3.0984467229367885e-07,
"loss": -0.0214,
"reward": 0.4374289773404598,
"reward_std": 0.24020638316869736,
"rewards/improved_len_reward_dast": 0.4374289773404598,
"step": 274
},
{
"completion_length": 1751.9540100097656,
"epoch": 0.712896953985742,
"grad_norm": 0.3334012031555176,
"kl": 0.06158447265625,
"learning_rate": 3.063992363004503e-07,
"loss": 0.045,
"reward": 0.560393676161766,
"reward_std": 0.24407575279474258,
"rewards/improved_len_reward_dast": 0.560393676161766,
"step": 275
},
{
"completion_length": 1921.1402282714844,
"epoch": 0.7154893065456902,
"grad_norm": 0.3832853436470032,
"kl": 0.07147216796875,
"learning_rate": 3.0297388305850004e-07,
"loss": 0.0472,
"reward": 0.4454130306839943,
"reward_std": 0.2820102423429489,
"rewards/improved_len_reward_dast": 0.4454130306839943,
"step": 276
},
{
"completion_length": 1837.2091369628906,
"epoch": 0.7180816591056384,
"grad_norm": 0.3518737256526947,
"kl": 0.0821533203125,
"learning_rate": 2.9956889495822877e-07,
"loss": 0.012,
"reward": 0.4661199301481247,
"reward_std": 0.27521887794137,
"rewards/improved_len_reward_dast": 0.4661199301481247,
"step": 277
},
{
"completion_length": 1900.5535583496094,
"epoch": 0.7206740116655865,
"grad_norm": 0.43427199125289917,
"kl": 0.0863037109375,
"learning_rate": 2.961845527111091e-07,
"loss": 0.023,
"reward": 0.49635138362646103,
"reward_std": 0.19695542380213737,
"rewards/improved_len_reward_dast": 0.49635138362646103,
"step": 278
},
{
"completion_length": 1781.9693603515625,
"epoch": 0.7232663642255347,
"grad_norm": 0.4264023005962372,
"kl": 0.0863037109375,
"learning_rate": 2.9282113532654363e-07,
"loss": 0.0375,
"reward": 0.5367333218455315,
"reward_std": 0.2848246172070503,
"rewards/improved_len_reward_dast": 0.5367333218455315,
"step": 279
},
{
"completion_length": 1870.1810607910156,
"epoch": 0.7258587167854829,
"grad_norm": 0.4594426155090332,
"kl": 0.093994140625,
"learning_rate": 2.894789200888634e-07,
"loss": 0.0754,
"reward": 0.5816426277160645,
"reward_std": 0.22311532869935036,
"rewards/improved_len_reward_dast": 0.5816426277160645,
"step": 280
},
{
"completion_length": 1424.9234619140625,
"epoch": 0.728451069345431,
"grad_norm": 0.29910653829574585,
"kl": 0.07781982421875,
"learning_rate": 2.8615818253446766e-07,
"loss": 0.0199,
"reward": 0.6617397367954254,
"reward_std": 0.2196519523859024,
"rewards/improved_len_reward_dast": 0.6617397367954254,
"step": 281
},
{
"completion_length": 1413.9310913085938,
"epoch": 0.7310434219053791,
"grad_norm": 0.3942880928516388,
"kl": 0.0758056640625,
"learning_rate": 2.828591964291093e-07,
"loss": 0.0087,
"reward": 0.5287511795759201,
"reward_std": 0.24336805939674377,
"rewards/improved_len_reward_dast": 0.5287511795759201,
"step": 282
},
{
"completion_length": 1595.8367309570312,
"epoch": 0.7336357744653272,
"grad_norm": 0.3628610074520111,
"kl": 0.07489013671875,
"learning_rate": 2.7958223374532363e-07,
"loss": -0.0144,
"reward": 0.492396779358387,
"reward_std": 0.28066498413681984,
"rewards/improved_len_reward_dast": 0.492396779358387,
"step": 283
},
{
"completion_length": 1388.1887512207031,
"epoch": 0.7362281270252754,
"grad_norm": 0.463160902261734,
"kl": 0.06719970703125,
"learning_rate": 2.7632756464000835e-07,
"loss": 0.0589,
"reward": 0.6620587855577469,
"reward_std": 0.20764853432774544,
"rewards/improved_len_reward_dast": 0.6620587855577469,
"step": 284
},
{
"completion_length": 2089.030517578125,
"epoch": 0.7388204795852236,
"grad_norm": 0.416213721036911,
"kl": 0.13720703125,
"learning_rate": 2.730954574321503e-07,
"loss": 0.0295,
"reward": 0.35152027755975723,
"reward_std": 0.24749820679426193,
"rewards/improved_len_reward_dast": 0.35152027755975723,
"step": 285
},
{
"completion_length": 1603.8214111328125,
"epoch": 0.7414128321451717,
"grad_norm": 0.2952825725078583,
"kl": 0.1046142578125,
"learning_rate": 2.698861785807055e-07,
"loss": 0.0192,
"reward": 0.5497759729623795,
"reward_std": 0.30123934894800186,
"rewards/improved_len_reward_dast": 0.5497759729623795,
"step": 286
},
{
"completion_length": 1780.3035583496094,
"epoch": 0.7440051847051199,
"grad_norm": 0.49455901980400085,
"kl": 0.1126708984375,
"learning_rate": 2.6669999266263154e-07,
"loss": -0.0016,
"reward": 0.45189622789621353,
"reward_std": 0.2775086760520935,
"rewards/improved_len_reward_dast": 0.45189622789621353,
"step": 287
},
{
"completion_length": 1889.9999389648438,
"epoch": 0.7465975372650681,
"grad_norm": 0.3843936324119568,
"kl": 0.11334228515625,
"learning_rate": 2.635371623510758e-07,
"loss": 0.0216,
"reward": 0.4534267857670784,
"reward_std": 0.24446595832705498,
"rewards/improved_len_reward_dast": 0.4534267857670784,
"step": 288
},
{
"completion_length": 1606.8596649169922,
"epoch": 0.7491898898250162,
"grad_norm": 0.3675477206707001,
"kl": 0.1318359375,
"learning_rate": 2.6039794839372066e-07,
"loss": 0.0298,
"reward": 0.5441700667142868,
"reward_std": 0.23767928034067154,
"rewards/improved_len_reward_dast": 0.5441700667142868,
"step": 289
},
{
"completion_length": 1844.4744567871094,
"epoch": 0.7517822423849644,
"grad_norm": 0.3915606141090393,
"kl": 0.1278076171875,
"learning_rate": 2.5728260959128614e-07,
"loss": 0.01,
"reward": 0.44573093950748444,
"reward_std": 0.2749871090054512,
"rewards/improved_len_reward_dast": 0.44573093950748444,
"step": 290
},
{
"completion_length": 2282.3596801757812,
"epoch": 0.7543745949449125,
"grad_norm": 0.4575667977333069,
"kl": 0.18359375,
"learning_rate": 2.541914027761951e-07,
"loss": 0.0404,
"reward": 0.4888821840286255,
"reward_std": 0.25494180247187614,
"rewards/improved_len_reward_dast": 0.4888821840286255,
"step": 291
},
{
"completion_length": 1613.5662994384766,
"epoch": 0.7569669475048607,
"grad_norm": 0.30206117033958435,
"kl": 0.128662109375,
"learning_rate": 2.511245827913991e-07,
"loss": 0.0256,
"reward": 0.49646422639489174,
"reward_std": 0.22853870689868927,
"rewards/improved_len_reward_dast": 0.49646422639489174,
"step": 292
},
{
"completion_length": 1986.2856750488281,
"epoch": 0.7595593000648088,
"grad_norm": 0.5634958148002625,
"kl": 0.15277099609375,
"learning_rate": 2.4808240246936866e-07,
"loss": -0.0016,
"reward": 0.422063373029232,
"reward_std": 0.27426348254084587,
"rewards/improved_len_reward_dast": 0.422063373029232,
"step": 293
},
{
"completion_length": 1912.8239440917969,
"epoch": 0.7621516526247569,
"grad_norm": 0.649374783039093,
"kl": 0.14990234375,
"learning_rate": 2.450651126112504e-07,
"loss": 0.0626,
"reward": 0.5210420861840248,
"reward_std": 0.1957332007586956,
"rewards/improved_len_reward_dast": 0.5210420861840248,
"step": 294
},
{
"completion_length": 1617.318832397461,
"epoch": 0.7647440051847051,
"grad_norm": 0.4732544720172882,
"kl": 0.13031005859375,
"learning_rate": 2.4207296196618924e-07,
"loss": 0.0492,
"reward": 0.5095237344503403,
"reward_std": 0.18864410370588303,
"rewards/improved_len_reward_dast": 0.5095237344503403,
"step": 295
},
{
"completion_length": 1087.5356903076172,
"epoch": 0.7673363577446533,
"grad_norm": 0.2905370593070984,
"kl": 0.0693359375,
"learning_rate": 2.3910619721082253e-07,
"loss": 0.0114,
"reward": 0.6024390161037445,
"reward_std": 0.22132978588342667,
"rewards/improved_len_reward_dast": 0.6024390161037445,
"step": 296
},
{
"completion_length": 1602.8443603515625,
"epoch": 0.7699287103046014,
"grad_norm": 0.3845907151699066,
"kl": 0.1500244140625,
"learning_rate": 2.3616506292894282e-07,
"loss": 0.0355,
"reward": 0.5333931297063828,
"reward_std": 0.3087846711277962,
"rewards/improved_len_reward_dast": 0.5333931297063828,
"step": 297
},
{
"completion_length": 1856.6402587890625,
"epoch": 0.7725210628645496,
"grad_norm": 0.5214760899543762,
"kl": 0.2025146484375,
"learning_rate": 2.332498015913344e-07,
"loss": 0.0709,
"reward": 0.5482401996850967,
"reward_std": 0.21799317747354507,
"rewards/improved_len_reward_dast": 0.5482401996850967,
"step": 298
},
{
"completion_length": 1488.5025329589844,
"epoch": 0.7751134154244977,
"grad_norm": 0.43935874104499817,
"kl": 0.186279296875,
"learning_rate": 2.303606535357843e-07,
"loss": 0.0366,
"reward": 0.5733404159545898,
"reward_std": 0.21773215383291245,
"rewards/improved_len_reward_dast": 0.5733404159545898,
"step": 299
},
{
"completion_length": 1722.0892486572266,
"epoch": 0.7777057679844459,
"grad_norm": 0.6731678247451782,
"kl": 0.249267578125,
"learning_rate": 2.2749785694726685e-07,
"loss": 0.0367,
"reward": 0.42770911008119583,
"reward_std": 0.2604576535522938,
"rewards/improved_len_reward_dast": 0.42770911008119583,
"step": 300
},
{
"completion_length": 1631.4693603515625,
"epoch": 0.7802981205443941,
"grad_norm": 0.5669419765472412,
"kl": 0.2646484375,
"learning_rate": 2.2466164783830972e-07,
"loss": 0.0641,
"reward": 0.4630614146590233,
"reward_std": 0.23011888936161995,
"rewards/improved_len_reward_dast": 0.4630614146590233,
"step": 301
},
{
"completion_length": 2058.1912536621094,
"epoch": 0.7828904731043422,
"grad_norm": 1.057323694229126,
"kl": 0.357421875,
"learning_rate": 2.2185226002953483e-07,
"loss": 0.0775,
"reward": 0.37563329190015793,
"reward_std": 0.2304530180990696,
"rewards/improved_len_reward_dast": 0.37563329190015793,
"step": 302
},
{
"completion_length": 2198.5254516601562,
"epoch": 0.7854828256642904,
"grad_norm": 0.8592402338981628,
"kl": 0.4775390625,
"learning_rate": 2.1906992513038268e-07,
"loss": 0.074,
"reward": 0.27012719213962555,
"reward_std": 0.28526439890265465,
"rewards/improved_len_reward_dast": 0.27012719213962555,
"step": 303
},
{
"completion_length": 1901.4846496582031,
"epoch": 0.7880751782242384,
"grad_norm": 0.8431882858276367,
"kl": 0.42138671875,
"learning_rate": 2.1631487252001822e-07,
"loss": 0.0492,
"reward": 0.37709545344114304,
"reward_std": 0.25447146967053413,
"rewards/improved_len_reward_dast": 0.37709545344114304,
"step": 304
},
{
"completion_length": 2066.573974609375,
"epoch": 0.7906675307841866,
"grad_norm": 1.21139657497406,
"kl": 0.4541015625,
"learning_rate": 2.1358732932842032e-07,
"loss": 0.0979,
"reward": 0.31043257750570774,
"reward_std": 0.24911593459546566,
"rewards/improved_len_reward_dast": 0.31043257750570774,
"step": 305
},
{
"completion_length": 2245.244903564453,
"epoch": 0.7932598833441348,
"grad_norm": 1.6808669567108154,
"kl": 0.705078125,
"learning_rate": 2.1088752041765734e-07,
"loss": 0.0566,
"reward": 0.26184154860675335,
"reward_std": 0.23590726777911186,
"rewards/improved_len_reward_dast": 0.26184154860675335,
"step": 306
},
{
"completion_length": 1971.1198425292969,
"epoch": 0.7958522359040829,
"grad_norm": 1.2574543952941895,
"kl": 0.59619140625,
"learning_rate": 2.0821566836334847e-07,
"loss": 0.0527,
"reward": 0.32783937454223633,
"reward_std": 0.24020928516983986,
"rewards/improved_len_reward_dast": 0.32783937454223633,
"step": 307
},
{
"completion_length": 1597.5178527832031,
"epoch": 0.7984445884640311,
"grad_norm": 1.4473228454589844,
"kl": 0.477294921875,
"learning_rate": 2.0557199343631494e-07,
"loss": 0.1143,
"reward": 0.35970793664455414,
"reward_std": 0.24855320155620575,
"rewards/improved_len_reward_dast": 0.35970793664455414,
"step": 308
},
{
"completion_length": 2033.6224365234375,
"epoch": 0.8010369410239793,
"grad_norm": 1.6653419733047485,
"kl": 0.60107421875,
"learning_rate": 2.0295671358442033e-07,
"loss": 0.0711,
"reward": 0.32821540907025337,
"reward_std": 0.2933182083070278,
"rewards/improved_len_reward_dast": 0.32821540907025337,
"step": 309
},
{
"completion_length": 1561.040771484375,
"epoch": 0.8036292935839274,
"grad_norm": 1.075103759765625,
"kl": 0.4736328125,
"learning_rate": 2.0037004441460263e-07,
"loss": 0.0643,
"reward": 0.29576242342591286,
"reward_std": 0.282806184142828,
"rewards/improved_len_reward_dast": 0.29576242342591286,
"step": 310
},
{
"completion_length": 1718.183609008789,
"epoch": 0.8062216461438756,
"grad_norm": 1.504372477531433,
"kl": 0.502197265625,
"learning_rate": 1.9781219917509987e-07,
"loss": 0.027,
"reward": 0.3890268914401531,
"reward_std": 0.19467511773109436,
"rewards/improved_len_reward_dast": 0.3890268914401531,
"step": 311
},
{
"completion_length": 1843.2831420898438,
"epoch": 0.8088139987038238,
"grad_norm": 2.515340566635132,
"kl": 0.6044921875,
"learning_rate": 1.9528338873786882e-07,
"loss": 0.0131,
"reward": 0.2797587066888809,
"reward_std": 0.2669145464897156,
"rewards/improved_len_reward_dast": 0.2797587066888809,
"step": 312
},
{
"completion_length": 1955.7856750488281,
"epoch": 0.8114063512637719,
"grad_norm": 1.1451594829559326,
"kl": 0.521728515625,
"learning_rate": 1.9278382158120116e-07,
"loss": 0.0705,
"reward": 0.3171418644487858,
"reward_std": 0.32902878522872925,
"rewards/improved_len_reward_dast": 0.3171418644487858,
"step": 313
},
{
"completion_length": 1826.882568359375,
"epoch": 0.81399870382372,
"grad_norm": 1.4347947835922241,
"kl": 0.44482421875,
"learning_rate": 1.9031370377253574e-07,
"loss": 0.1087,
"reward": 0.43766431510448456,
"reward_std": 0.2934253178536892,
"rewards/improved_len_reward_dast": 0.43766431510448456,
"step": 314
},
{
"completion_length": 2287.155548095703,
"epoch": 0.8165910563836681,
"grad_norm": 2.921687126159668,
"kl": 0.751953125,
"learning_rate": 1.8787323895147052e-07,
"loss": 0.0098,
"reward": 0.13603791175410151,
"reward_std": 0.3099226616322994,
"rewards/improved_len_reward_dast": 0.13603791175410151,
"step": 315
},
{
"completion_length": 1757.688720703125,
"epoch": 0.8191834089436163,
"grad_norm": 1.0537687540054321,
"kl": 0.3984375,
"learning_rate": 1.8546262831297438e-07,
"loss": 0.0769,
"reward": 0.3936043158173561,
"reward_std": 0.23435594514012337,
"rewards/improved_len_reward_dast": 0.3936043158173561,
"step": 316
},
{
"completion_length": 2103.642791748047,
"epoch": 0.8217757615035645,
"grad_norm": 2.2986154556274414,
"kl": 0.72412109375,
"learning_rate": 1.8308207059079938e-07,
"loss": 0.0538,
"reward": 0.2378080729395151,
"reward_std": 0.301589660346508,
"rewards/improved_len_reward_dast": 0.2378080729395151,
"step": 317
},
{
"completion_length": 2209.165740966797,
"epoch": 0.8243681140635126,
"grad_norm": 1.0545653104782104,
"kl": 0.5751953125,
"learning_rate": 1.8073176204109837e-07,
"loss": 0.0822,
"reward": 0.36628346890211105,
"reward_std": 0.3119317665696144,
"rewards/improved_len_reward_dast": 0.36628346890211105,
"step": 318
},
{
"completion_length": 2171.931121826172,
"epoch": 0.8269604666234608,
"grad_norm": 1.807541847229004,
"kl": 0.59375,
"learning_rate": 1.7841189642624428e-07,
"loss": 0.0284,
"reward": 0.2682526409626007,
"reward_std": 0.29699693247675896,
"rewards/improved_len_reward_dast": 0.2682526409626007,
"step": 319
},
{
"completion_length": 2228.471893310547,
"epoch": 0.829552819183409,
"grad_norm": 1.075270652770996,
"kl": 0.5126953125,
"learning_rate": 1.7612266499885642e-07,
"loss": 0.0831,
"reward": 0.36243029683828354,
"reward_std": 0.28352705761790276,
"rewards/improved_len_reward_dast": 0.36243029683828354,
"step": 320
},
{
"completion_length": 1616.5943603515625,
"epoch": 0.8321451717433571,
"grad_norm": 1.301566243171692,
"kl": 0.39794921875,
"learning_rate": 1.7386425648603354e-07,
"loss": 0.0878,
"reward": 0.4413522332906723,
"reward_std": 0.24165164679288864,
"rewards/improved_len_reward_dast": 0.4413522332906723,
"step": 321
},
{
"completion_length": 1898.8468933105469,
"epoch": 0.8347375243033053,
"grad_norm": 1.221224308013916,
"kl": 0.501953125,
"learning_rate": 1.716368570737946e-07,
"loss": 0.0483,
"reward": 0.2525193989276886,
"reward_std": 0.3362556993961334,
"rewards/improved_len_reward_dast": 0.2525193989276886,
"step": 322
},
{
"completion_length": 2523.7422790527344,
"epoch": 0.8373298768632534,
"grad_norm": 1.103893756866455,
"kl": 0.5478515625,
"learning_rate": 1.6944065039173004e-07,
"loss": 0.0645,
"reward": 0.11981333699077368,
"reward_std": 0.3073917515575886,
"rewards/improved_len_reward_dast": 0.11981333699077368,
"step": 323
},
{
"completion_length": 2292.2525024414062,
"epoch": 0.8399222294232016,
"grad_norm": 1.1243617534637451,
"kl": 0.4267578125,
"learning_rate": 1.672758174978622e-07,
"loss": 0.0546,
"reward": 0.2594154104590416,
"reward_std": 0.2828039526939392,
"rewards/improved_len_reward_dast": 0.2594154104590416,
"step": 324
},
{
"completion_length": 1818.2958984375,
"epoch": 0.8425145819831497,
"grad_norm": 1.7014206647872925,
"kl": 0.4521484375,
"learning_rate": 1.6514253686371917e-07,
"loss": 0.1072,
"reward": 0.3174915425479412,
"reward_std": 0.27957041934132576,
"rewards/improved_len_reward_dast": 0.3174915425479412,
"step": 325
},
{
"completion_length": 1835.6147766113281,
"epoch": 0.8451069345430978,
"grad_norm": 0.6114805936813354,
"kl": 0.344482421875,
"learning_rate": 1.630409843596216e-07,
"loss": 0.0602,
"reward": 0.38836774975061417,
"reward_std": 0.27062665671110153,
"rewards/improved_len_reward_dast": 0.38836774975061417,
"step": 326
},
{
"completion_length": 1859.8290100097656,
"epoch": 0.847699287103046,
"grad_norm": 1.033449649810791,
"kl": 0.3984375,
"learning_rate": 1.609713332401831e-07,
"loss": 0.0335,
"reward": 0.35275041311979294,
"reward_std": 0.2643149308860302,
"rewards/improved_len_reward_dast": 0.35275041311979294,
"step": 327
},
{
"completion_length": 1828.1096496582031,
"epoch": 0.8502916396629941,
"grad_norm": 0.9170458912849426,
"kl": 0.41552734375,
"learning_rate": 1.5893375413002765e-07,
"loss": 0.0429,
"reward": 0.2639412134885788,
"reward_std": 0.26479368656873703,
"rewards/improved_len_reward_dast": 0.2639412134885788,
"step": 328
},
{
"completion_length": 2143.4591369628906,
"epoch": 0.8528839922229423,
"grad_norm": 0.4960505962371826,
"kl": 0.32421875,
"learning_rate": 1.569284150097226e-07,
"loss": 0.0477,
"reward": 0.34283383935689926,
"reward_std": 0.24718820676207542,
"rewards/improved_len_reward_dast": 0.34283383935689926,
"step": 329
},
{
"completion_length": 1974.7805786132812,
"epoch": 0.8554763447828905,
"grad_norm": 0.8014364838600159,
"kl": 0.52587890625,
"learning_rate": 1.5495548120193003e-07,
"loss": 0.0955,
"reward": 0.2745523639023304,
"reward_std": 0.3219694271683693,
"rewards/improved_len_reward_dast": 0.2745523639023304,
"step": 330
},
{
"completion_length": 1617.0892333984375,
"epoch": 0.8580686973428386,
"grad_norm": 1.7895324230194092,
"kl": 0.3525390625,
"learning_rate": 1.5301511535777784e-07,
"loss": 0.1257,
"reward": 0.43920181691646576,
"reward_std": 0.26625148952007294,
"rewards/improved_len_reward_dast": 0.43920181691646576,
"step": 331
},
{
"completion_length": 1910.3468627929688,
"epoch": 0.8606610499027868,
"grad_norm": 1.3269976377487183,
"kl": 0.4423828125,
"learning_rate": 1.5110747744345006e-07,
"loss": 0.0978,
"reward": 0.25987571477890015,
"reward_std": 0.32213833928108215,
"rewards/improved_len_reward_dast": 0.25987571477890015,
"step": 332
},
{
"completion_length": 1990.3570861816406,
"epoch": 0.863253402462735,
"grad_norm": 1.6192659139633179,
"kl": 0.510986328125,
"learning_rate": 1.4923272472699986e-07,
"loss": 0.0687,
"reward": 0.19616913609206676,
"reward_std": 0.2878040000796318,
"rewards/improved_len_reward_dast": 0.19616913609206676,
"step": 333
},
{
"completion_length": 1567.4055786132812,
"epoch": 0.8658457550226831,
"grad_norm": 1.5365605354309082,
"kl": 0.43603515625,
"learning_rate": 1.4739101176538274e-07,
"loss": 0.1329,
"reward": 0.21937411278486252,
"reward_std": 0.3014941178262234,
"rewards/improved_len_reward_dast": 0.21937411278486252,
"step": 334
},
{
"completion_length": 1955.3647766113281,
"epoch": 0.8684381075826313,
"grad_norm": 1.001995325088501,
"kl": 0.40283203125,
"learning_rate": 1.4558249039171639e-07,
"loss": 0.0949,
"reward": 0.25975842773914337,
"reward_std": 0.3084189146757126,
"rewards/improved_len_reward_dast": 0.25975842773914337,
"step": 335
},
{
"completion_length": 2266.614776611328,
"epoch": 0.8710304601425793,
"grad_norm": 1.8815863132476807,
"kl": 0.4033203125,
"learning_rate": 1.4380730970276195e-07,
"loss": 0.1972,
"reward": 0.2519003488123417,
"reward_std": 0.25883801840245724,
"rewards/improved_len_reward_dast": 0.2519003488123417,
"step": 336
},
{
"completion_length": 2940.5254516601562,
"epoch": 0.8736228127025275,
"grad_norm": 1.5017142295837402,
"kl": 0.7333984375,
"learning_rate": 1.420656160466333e-07,
"loss": 0.0465,
"reward": 0.07994039542973042,
"reward_std": 0.2474011294543743,
"rewards/improved_len_reward_dast": 0.07994039542973042,
"step": 337
},
{
"completion_length": 2536.4744567871094,
"epoch": 0.8762151652624757,
"grad_norm": 3.5930166244506836,
"kl": 0.65185546875,
"learning_rate": 1.4035755301073102e-07,
"loss": 0.2242,
"reward": 0.2070501446723938,
"reward_std": 0.2260904610157013,
"rewards/improved_len_reward_dast": 0.2070501446723938,
"step": 338
},
{
"completion_length": 2507.9974365234375,
"epoch": 0.8788075178224238,
"grad_norm": 2.0312681198120117,
"kl": 0.79931640625,
"learning_rate": 1.386832614099056e-07,
"loss": 0.2886,
"reward": 0.1947159543633461,
"reward_std": 0.25280311703681946,
"rewards/improved_len_reward_dast": 0.1947159543633461,
"step": 339
},
{
"completion_length": 2240.9642333984375,
"epoch": 0.881399870382372,
"grad_norm": 6.861503601074219,
"kl": 0.6796875,
"learning_rate": 1.3704287927484846e-07,
"loss": 0.4098,
"reward": 0.09058164700400084,
"reward_std": 0.22857186198234558,
"rewards/improved_len_reward_dast": 0.09058164700400084,
"step": 340
},
{
"completion_length": 2856.7295532226562,
"epoch": 0.8839922229423202,
"grad_norm": 1.7314866781234741,
"kl": 0.9296875,
"learning_rate": 1.3543654184071186e-07,
"loss": 0.2576,
"reward": 0.04280344722792506,
"reward_std": 0.23391348123550415,
"rewards/improved_len_reward_dast": 0.04280344722792506,
"step": 341
},
{
"completion_length": 3017.0203857421875,
"epoch": 0.8865845755022683,
"grad_norm": 2.1199285984039307,
"kl": 1.123046875,
"learning_rate": 1.3386438153596067e-07,
"loss": 0.259,
"reward": 0.03899642452597618,
"reward_std": 0.1816622130572796,
"rewards/improved_len_reward_dast": 0.03899642452597618,
"step": 342
},
{
"completion_length": 2673.2601318359375,
"epoch": 0.8891769280622165,
"grad_norm": 4.129951000213623,
"kl": 1.0849609375,
"learning_rate": 1.323265279714543e-07,
"loss": 0.4278,
"reward": 0.07075950875878334,
"reward_std": 0.17871661111712456,
"rewards/improved_len_reward_dast": 0.07075950875878334,
"step": 343
},
{
"completion_length": 2820.4183349609375,
"epoch": 0.8917692806221647,
"grad_norm": 3.029540777206421,
"kl": 1.125,
"learning_rate": 1.3082310792976202e-07,
"loss": 0.2883,
"reward": 0.1679403679445386,
"reward_std": 0.1663094200193882,
"rewards/improved_len_reward_dast": 0.1679403679445386,
"step": 344
},
{
"completion_length": 2832.7269897460938,
"epoch": 0.8943616331821128,
"grad_norm": 1.9915223121643066,
"kl": 1.0732421875,
"learning_rate": 1.293542453547102e-07,
"loss": 0.2825,
"reward": 0.09134133439511061,
"reward_std": 0.25903644412755966,
"rewards/improved_len_reward_dast": 0.09134133439511061,
"step": 345
},
{
"completion_length": 2574.1351318359375,
"epoch": 0.8969539857420609,
"grad_norm": 2.0541656017303467,
"kl": 1.0693359375,
"learning_rate": 1.279200613411642e-07,
"loss": 0.3294,
"reward": 0.09616942587308586,
"reward_std": 0.2244393788278103,
"rewards/improved_len_reward_dast": 0.09616942587308586,
"step": 346
},
{
"completion_length": 3264.4540405273438,
"epoch": 0.899546338302009,
"grad_norm": 2.7875149250030518,
"kl": 1.189453125,
"learning_rate": 1.2652067412504605e-07,
"loss": 0.1564,
"reward": 0.06414215068798512,
"reward_std": 0.20420588552951813,
"rewards/improved_len_reward_dast": 0.06414215068798512,
"step": 347
},
{
"completion_length": 2914.2907104492188,
"epoch": 0.9021386908619572,
"grad_norm": 6.438938617706299,
"kl": 1.1162109375,
"learning_rate": 1.251561990735859e-07,
"loss": 0.3186,
"reward": 0.14111983217298985,
"reward_std": 0.1892341412603855,
"rewards/improved_len_reward_dast": 0.14111983217298985,
"step": 348
},
{
"completion_length": 3092.1886596679688,
"epoch": 0.9047310434219054,
"grad_norm": 4.280767917633057,
"kl": 0.80224609375,
"learning_rate": 1.238267486758117e-07,
"loss": 0.1811,
"reward": 0.0037063490599393845,
"reward_std": 0.19519924372434616,
"rewards/improved_len_reward_dast": 0.0037063490599393845,
"step": 349
},
{
"completion_length": 3299.4285888671875,
"epoch": 0.9073233959818535,
"grad_norm": 1.7542351484298706,
"kl": 0.72265625,
"learning_rate": 1.2253243253327504e-07,
"loss": 0.2208,
"reward": 0.11870704032480717,
"reward_std": 0.20294193923473358,
"rewards/improved_len_reward_dast": 0.11870704032480717,
"step": 350
},
{
"completion_length": 2654.012725830078,
"epoch": 0.9099157485418017,
"grad_norm": 8.6357421875,
"kl": 0.44775390625,
"learning_rate": 1.212733573510154e-07,
"loss": 0.2941,
"reward": 0.256839819252491,
"reward_std": 0.22414838150143623,
"rewards/improved_len_reward_dast": 0.256839819252491,
"step": 351
},
{
"completion_length": 2815.5816040039062,
"epoch": 0.9125081011017498,
"grad_norm": 4.935699462890625,
"kl": 0.4931640625,
"learning_rate": 1.20049626928764e-07,
"loss": 0.3194,
"reward": 0.1734664011746645,
"reward_std": 0.22984974458813667,
"rewards/improved_len_reward_dast": 0.1734664011746645,
"step": 352
},
{
"completion_length": 3032.7295532226562,
"epoch": 0.915100453661698,
"grad_norm": 7.583729267120361,
"kl": 0.67822265625,
"learning_rate": 1.1886134215238539e-07,
"loss": 0.2992,
"reward": 0.13638373278081417,
"reward_std": 0.20758359506726265,
"rewards/improved_len_reward_dast": 0.13638373278081417,
"step": 353
},
{
"completion_length": 2698.2423095703125,
"epoch": 0.9176928062216462,
"grad_norm": 9.70632553100586,
"kl": 0.9833984375,
"learning_rate": 1.1770860098556122e-07,
"loss": 0.3735,
"reward": 0.1530514433979988,
"reward_std": 0.20933512970805168,
"rewards/improved_len_reward_dast": 0.1530514433979988,
"step": 354
},
{
"completion_length": 2930.4693908691406,
"epoch": 0.9202851587815943,
"grad_norm": 1.9580261707305908,
"kl": 1.13671875,
"learning_rate": 1.1659149846171314e-07,
"loss": 0.2547,
"reward": 0.1397750903852284,
"reward_std": 0.16655682772397995,
"rewards/improved_len_reward_dast": 0.1397750903852284,
"step": 355
},
{
"completion_length": 2562.2881774902344,
"epoch": 0.9228775113415425,
"grad_norm": 4.772850036621094,
"kl": 0.8603515625,
"learning_rate": 1.1551012667616889e-07,
"loss": 0.5187,
"reward": 0.17518793791532516,
"reward_std": 0.18449129536747932,
"rewards/improved_len_reward_dast": 0.17518793791532516,
"step": 356
},
{
"completion_length": 2741.387725830078,
"epoch": 0.9254698639014906,
"grad_norm": 1.7493088245391846,
"kl": 0.87744140625,
"learning_rate": 1.1446457477856933e-07,
"loss": 0.3361,
"reward": 0.11712268507108092,
"reward_std": 0.22804437577724457,
"rewards/improved_len_reward_dast": 0.11712268507108092,
"step": 357
},
{
"completion_length": 2930.1070556640625,
"epoch": 0.9280622164614387,
"grad_norm": 2.6250553131103516,
"kl": 1.537109375,
"learning_rate": 1.1345492896551908e-07,
"loss": 0.2382,
"reward": 0.12876404216513038,
"reward_std": 0.23319095373153687,
"rewards/improved_len_reward_dast": 0.12876404216513038,
"step": 358
},
{
"completion_length": 2673.6530151367188,
"epoch": 0.9306545690213869,
"grad_norm": 5.296914100646973,
"kl": 1.3232421875,
"learning_rate": 1.1248127247348025e-07,
"loss": 0.342,
"reward": 0.23917717207223177,
"reward_std": 0.22187871485948563,
"rewards/improved_len_reward_dast": 0.23917717207223177,
"step": 359
},
{
"completion_length": 2845.5509643554688,
"epoch": 0.933246921581335,
"grad_norm": 1.8034971952438354,
"kl": 1.294921875,
"learning_rate": 1.1154368557191032e-07,
"loss": 0.3034,
"reward": 0.11301134852692485,
"reward_std": 0.2030489146709442,
"rewards/improved_len_reward_dast": 0.11301134852692485,
"step": 360
},
{
"completion_length": 2302.813751220703,
"epoch": 0.9358392741412832,
"grad_norm": 1.5550919771194458,
"kl": 0.9931640625,
"learning_rate": 1.1064224555664489e-07,
"loss": 0.3347,
"reward": 0.16131599247455597,
"reward_std": 0.2146884724497795,
"rewards/improved_len_reward_dast": 0.16131599247455597,
"step": 361
},
{
"completion_length": 3010.4871215820312,
"epoch": 0.9384316267012314,
"grad_norm": 1.0812184810638428,
"kl": 1.267578125,
"learning_rate": 1.0977702674352485e-07,
"loss": 0.3206,
"reward": 0.08643259108066559,
"reward_std": 0.19253767281770706,
"rewards/improved_len_reward_dast": 0.08643259108066559,
"step": 362
},
{
"completion_length": 2673.1708374023438,
"epoch": 0.9410239792611795,
"grad_norm": 1.4644445180892944,
"kl": 1.01123046875,
"learning_rate": 1.0894810046227007e-07,
"loss": 0.3343,
"reward": 0.18486913572996855,
"reward_std": 0.2324334941804409,
"rewards/improved_len_reward_dast": 0.18486913572996855,
"step": 363
},
{
"completion_length": 2397.7474365234375,
"epoch": 0.9436163318211277,
"grad_norm": 3.365234851837158,
"kl": 0.8212890625,
"learning_rate": 1.0815553505059864e-07,
"loss": 0.3498,
"reward": 0.2783215790987015,
"reward_std": 0.20841009542346,
"rewards/improved_len_reward_dast": 0.2783215790987015,
"step": 364
},
{
"completion_length": 2809.4030151367188,
"epoch": 0.9462086843810759,
"grad_norm": 4.3384833335876465,
"kl": 0.8330078125,
"learning_rate": 1.0739939584859327e-07,
"loss": 0.2444,
"reward": 0.24181043915450573,
"reward_std": 0.2128814272582531,
"rewards/improved_len_reward_dast": 0.24181043915450573,
"step": 365
},
{
"completion_length": 2888.7474365234375,
"epoch": 0.948801036941024,
"grad_norm": 5.5493574142456055,
"kl": 0.7646484375,
"learning_rate": 1.066797451933144e-07,
"loss": 0.2898,
"reward": 0.20839058235287666,
"reward_std": 0.22839120030403137,
"rewards/improved_len_reward_dast": 0.20839058235287666,
"step": 366
},
{
"completion_length": 2496.9310302734375,
"epoch": 0.9513933895009722,
"grad_norm": 4.914091110229492,
"kl": 0.8203125,
"learning_rate": 1.0599664241366108e-07,
"loss": 0.2941,
"reward": 0.2661769837141037,
"reward_std": 0.26103585585951805,
"rewards/improved_len_reward_dast": 0.2661769837141037,
"step": 367
},
{
"completion_length": 2880.0280151367188,
"epoch": 0.9539857420609202,
"grad_norm": 3.833822011947632,
"kl": 0.83251953125,
"learning_rate": 1.0535014382547976e-07,
"loss": 0.2725,
"reward": 0.2633733693510294,
"reward_std": 0.2882365696132183,
"rewards/improved_len_reward_dast": 0.2633733693510294,
"step": 368
},
{
"completion_length": 2686.16064453125,
"epoch": 0.9565780946208684,
"grad_norm": 3.0906360149383545,
"kl": 1.05078125,
"learning_rate": 1.0474030272692176e-07,
"loss": 0.3183,
"reward": 0.21776283904910088,
"reward_std": 0.26876696199178696,
"rewards/improved_len_reward_dast": 0.21776283904910088,
"step": 369
},
{
"completion_length": 2501.721923828125,
"epoch": 0.9591704471808166,
"grad_norm": 1.4724576473236084,
"kl": 0.9521484375,
"learning_rate": 1.0416716939404906e-07,
"loss": 0.2768,
"reward": 0.24670540168881416,
"reward_std": 0.24352310225367546,
"rewards/improved_len_reward_dast": 0.24670540168881416,
"step": 370
},
{
"completion_length": 2630.0331420898438,
"epoch": 0.9617627997407647,
"grad_norm": 1.545382022857666,
"kl": 1.044921875,
"learning_rate": 1.0363079107668965e-07,
"loss": 0.2864,
"reward": 0.22240487672388554,
"reward_std": 0.22445869073271751,
"rewards/improved_len_reward_dast": 0.22240487672388554,
"step": 371
},
{
"completion_length": 2604.4744262695312,
"epoch": 0.9643551523007129,
"grad_norm": 1.856889009475708,
"kl": 1.2001953125,
"learning_rate": 1.03131211994542e-07,
"loss": 0.2622,
"reward": 0.16849582828581333,
"reward_std": 0.216482974588871,
"rewards/improved_len_reward_dast": 0.16849582828581333,
"step": 372
},
{
"completion_length": 2533.7601928710938,
"epoch": 0.9669475048606611,
"grad_norm": 1.1367889642715454,
"kl": 1.12841796875,
"learning_rate": 1.0266847333352986e-07,
"loss": 0.3456,
"reward": 0.2437375970184803,
"reward_std": 0.21505925431847572,
"rewards/improved_len_reward_dast": 0.2437375970184803,
"step": 373
},
{
"completion_length": 2314.323944091797,
"epoch": 0.9695398574206092,
"grad_norm": 1.5537844896316528,
"kl": 1.2490234375,
"learning_rate": 1.022426132424064e-07,
"loss": 0.3655,
"reward": 0.19233586266636848,
"reward_std": 0.21690138429403305,
"rewards/improved_len_reward_dast": 0.19233586266636848,
"step": 374
},
{
"completion_length": 2416.1275329589844,
"epoch": 0.9721322099805574,
"grad_norm": 1.4035214185714722,
"kl": 1.17578125,
"learning_rate": 1.0185366682960968e-07,
"loss": 0.3357,
"reward": 0.20540117495693266,
"reward_std": 0.23184461519122124,
"rewards/improved_len_reward_dast": 0.20540117495693266,
"step": 375
},
{
"completion_length": 2568.6044921875,
"epoch": 0.9747245625405055,
"grad_norm": 2.378544330596924,
"kl": 1.412109375,
"learning_rate": 1.015016661603677e-07,
"loss": 0.3565,
"reward": 0.17796143516898155,
"reward_std": 0.18820034340023994,
"rewards/improved_len_reward_dast": 0.17796143516898155,
"step": 376
},
{
"completion_length": 2732.6095581054688,
"epoch": 0.9773169151004537,
"grad_norm": 1.6996707916259766,
"kl": 1.330078125,
"learning_rate": 1.011866402540555e-07,
"loss": 0.2989,
"reward": 0.10426153149455786,
"reward_std": 0.20582210645079613,
"rewards/improved_len_reward_dast": 0.10426153149455786,
"step": 377
},
{
"completion_length": 2732.6937866210938,
"epoch": 0.9799092676604018,
"grad_norm": 1.3265814781188965,
"kl": 1.0888671875,
"learning_rate": 1.0090861508180229e-07,
"loss": 0.342,
"reward": 0.15975524485111237,
"reward_std": 0.20655079558491707,
"rewards/improved_len_reward_dast": 0.15975524485111237,
"step": 378
},
{
"completion_length": 2176.466766357422,
"epoch": 0.9825016202203499,
"grad_norm": 1.45902419090271,
"kl": 1.046875,
"learning_rate": 1.006676135643506e-07,
"loss": 0.4055,
"reward": 0.29564017802476883,
"reward_std": 0.22227967530488968,
"rewards/improved_len_reward_dast": 0.29564017802476883,
"step": 379
},
{
"completion_length": 2283.53564453125,
"epoch": 0.9850939727802981,
"grad_norm": 1.839316964149475,
"kl": 0.7939453125,
"learning_rate": 1.004636555701666e-07,
"loss": 0.2705,
"reward": 0.28889062255620956,
"reward_std": 0.24509106576442719,
"rewards/improved_len_reward_dast": 0.28889062255620956,
"step": 380
},
{
"completion_length": 2493.9769287109375,
"epoch": 0.9876863253402463,
"grad_norm": 1.2430649995803833,
"kl": 0.533203125,
"learning_rate": 1.0029675791380211e-07,
"loss": 0.2568,
"reward": 0.36698443442583084,
"reward_std": 0.2436152882874012,
"rewards/improved_len_reward_dast": 0.36698443442583084,
"step": 381
},
{
"completion_length": 2185.372344970703,
"epoch": 0.9902786779001944,
"grad_norm": 0.8913379907608032,
"kl": 0.5322265625,
"learning_rate": 1.0016693435450846e-07,
"loss": 0.2093,
"reward": 0.3222588375210762,
"reward_std": 0.28892357647418976,
"rewards/improved_len_reward_dast": 0.3222588375210762,
"step": 382
},
{
"completion_length": 2258.5382080078125,
"epoch": 0.9928710304601426,
"grad_norm": 0.7607825398445129,
"kl": 0.333251953125,
"learning_rate": 1.00074195595102e-07,
"loss": 0.14,
"reward": 0.42002584785223007,
"reward_std": 0.22813301160931587,
"rewards/improved_len_reward_dast": 0.42002584785223007,
"step": 383
},
{
"completion_length": 1829.7269897460938,
"epoch": 0.9954633830200907,
"grad_norm": 1.3492361307144165,
"kl": 0.322021484375,
"learning_rate": 1.0001854928108199e-07,
"loss": 0.2535,
"reward": 0.4132830575108528,
"reward_std": 0.2525113746523857,
"rewards/improved_len_reward_dast": 0.4132830575108528,
"step": 384
},
{
"completion_length": 1845.188705444336,
"epoch": 0.9980557355800389,
"grad_norm": 0.7738073468208313,
"kl": 0.290771484375,
"learning_rate": 1e-07,
"loss": 0.0939,
"reward": 0.41781602054834366,
"reward_std": 0.2655208185315132,
"rewards/improved_len_reward_dast": 0.41781602054834366,
"step": 385
},
{
"epoch": 0.9980557355800389,
"step": 385,
"total_flos": 0.0,
"train_loss": 0.051004564216343064,
"train_runtime": 55695.085,
"train_samples_per_second": 0.194,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 385,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 14,
"trial_name": null,
"trial_params": null
}