Kadins's picture
Model save
16e7ac5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9980557355800389,
"eval_steps": 500,
"global_step": 385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1848.5458984375,
"epoch": 0.002592352559948153,
"grad_norm": 0.15340697800867806,
"kl": 0.0,
"learning_rate": 2.564102564102564e-08,
"loss": 0.0187,
"reward": 0.5978657901287079,
"reward_std": 0.2931807413697243,
"rewards/improved_len_reward_dast": 0.5978657901287079,
"step": 1
},
{
"completion_length": 2130.4540100097656,
"epoch": 0.005184705119896306,
"grad_norm": 0.19427816311387766,
"kl": 0.0,
"learning_rate": 5.128205128205128e-08,
"loss": 0.0499,
"reward": 0.3565452806651592,
"reward_std": 0.20069235190749168,
"rewards/improved_len_reward_dast": 0.3565452806651592,
"step": 2
},
{
"completion_length": 2029.7754821777344,
"epoch": 0.007777057679844459,
"grad_norm": 0.15482062458259815,
"kl": 0.00014066696166992188,
"learning_rate": 7.692307692307692e-08,
"loss": 0.0029,
"reward": 0.36762307211756706,
"reward_std": 0.26278356462717056,
"rewards/improved_len_reward_dast": 0.36762307211756706,
"step": 3
},
{
"completion_length": 2113.341796875,
"epoch": 0.010369410239792612,
"grad_norm": 0.14048722076875736,
"kl": 0.00012814998626708984,
"learning_rate": 1.0256410256410256e-07,
"loss": -0.0221,
"reward": 0.48202627897262573,
"reward_std": 0.3225458636879921,
"rewards/improved_len_reward_dast": 0.48202627897262573,
"step": 4
},
{
"completion_length": 1881.0356750488281,
"epoch": 0.012961762799740765,
"grad_norm": 0.15612866238173184,
"kl": 0.00012302398681640625,
"learning_rate": 1.2820512820512818e-07,
"loss": -0.0027,
"reward": 0.52305668592453,
"reward_std": 0.21705364808440208,
"rewards/improved_len_reward_dast": 0.52305668592453,
"step": 5
},
{
"completion_length": 2154.0663146972656,
"epoch": 0.015554115359688918,
"grad_norm": 0.15018629000790815,
"kl": 0.00014281272888183594,
"learning_rate": 1.5384615384615385e-07,
"loss": -0.0153,
"reward": 0.39995063841342926,
"reward_std": 0.2720435969531536,
"rewards/improved_len_reward_dast": 0.39995063841342926,
"step": 6
},
{
"completion_length": 1875.1071472167969,
"epoch": 0.01814646791963707,
"grad_norm": 0.17127136002254692,
"kl": 0.0001201629638671875,
"learning_rate": 1.7948717948717948e-07,
"loss": 0.0249,
"reward": 0.3945396225899458,
"reward_std": 0.25139790773391724,
"rewards/improved_len_reward_dast": 0.3945396225899458,
"step": 7
},
{
"completion_length": 1811.3316040039062,
"epoch": 0.020738820479585224,
"grad_norm": 0.17118361086335854,
"kl": 0.00010824203491210938,
"learning_rate": 2.0512820512820512e-07,
"loss": -0.0186,
"reward": 0.4574318379163742,
"reward_std": 0.2575865164399147,
"rewards/improved_len_reward_dast": 0.4574318379163742,
"step": 8
},
{
"completion_length": 2194.7601928710938,
"epoch": 0.023331173039533377,
"grad_norm": 0.16326988555433258,
"kl": 0.00013935565948486328,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.0077,
"reward": 0.34908392280340195,
"reward_std": 0.2770259566605091,
"rewards/improved_len_reward_dast": 0.34908392280340195,
"step": 9
},
{
"completion_length": 1998.94384765625,
"epoch": 0.02592352559948153,
"grad_norm": 0.170741007261295,
"kl": 0.00012493133544921875,
"learning_rate": 2.5641025641025636e-07,
"loss": 0.0192,
"reward": 0.4179102033376694,
"reward_std": 0.26221491396427155,
"rewards/improved_len_reward_dast": 0.4179102033376694,
"step": 10
},
{
"completion_length": 2219.8724365234375,
"epoch": 0.028515878159429683,
"grad_norm": 0.17546980641506144,
"kl": 0.0001461505889892578,
"learning_rate": 2.8205128205128203e-07,
"loss": 0.0316,
"reward": 0.29684413131326437,
"reward_std": 0.31361983716487885,
"rewards/improved_len_reward_dast": 0.29684413131326437,
"step": 11
},
{
"completion_length": 2014.03564453125,
"epoch": 0.031108230719377836,
"grad_norm": 0.1541338261942444,
"kl": 0.00012749433517456055,
"learning_rate": 3.076923076923077e-07,
"loss": -0.0135,
"reward": 0.32144954474642873,
"reward_std": 0.30298536643385887,
"rewards/improved_len_reward_dast": 0.32144954474642873,
"step": 12
},
{
"completion_length": 2028.5969543457031,
"epoch": 0.033700583279325985,
"grad_norm": 0.19174062151927665,
"kl": 0.00013196468353271484,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0178,
"reward": 0.31833722069859505,
"reward_std": 0.25612180307507515,
"rewards/improved_len_reward_dast": 0.31833722069859505,
"step": 13
},
{
"completion_length": 2096.3060607910156,
"epoch": 0.03629293583927414,
"grad_norm": 0.14669329107868662,
"kl": 0.00011658668518066406,
"learning_rate": 3.5897435897435896e-07,
"loss": 0.0049,
"reward": 0.4347623288631439,
"reward_std": 0.21591071039438248,
"rewards/improved_len_reward_dast": 0.4347623288631439,
"step": 14
},
{
"completion_length": 1500.5254821777344,
"epoch": 0.03888528839922229,
"grad_norm": 0.17776360380978226,
"kl": 7.94529914855957e-05,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.0382,
"reward": 0.4248454347252846,
"reward_std": 0.2069440335035324,
"rewards/improved_len_reward_dast": 0.4248454347252846,
"step": 15
},
{
"completion_length": 1272.0509948730469,
"epoch": 0.04147764095917045,
"grad_norm": 0.1784145917886179,
"kl": 9.012222290039062e-05,
"learning_rate": 4.1025641025641024e-07,
"loss": 0.0089,
"reward": 0.5397656932473183,
"reward_std": 0.2598051242530346,
"rewards/improved_len_reward_dast": 0.5397656932473183,
"step": 16
},
{
"completion_length": 2013.8724365234375,
"epoch": 0.0440699935191186,
"grad_norm": 0.1907346351579617,
"kl": 0.0001175999641418457,
"learning_rate": 4.358974358974359e-07,
"loss": 0.0611,
"reward": 0.3499421738088131,
"reward_std": 0.33140237629413605,
"rewards/improved_len_reward_dast": 0.3499421738088131,
"step": 17
},
{
"completion_length": 1330.7244567871094,
"epoch": 0.046662346079066754,
"grad_norm": 0.2178582705000707,
"kl": 7.390975952148438e-05,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.0745,
"reward": 0.380832314491272,
"reward_std": 0.2622619494795799,
"rewards/improved_len_reward_dast": 0.380832314491272,
"step": 18
},
{
"completion_length": 1699.9285888671875,
"epoch": 0.0492546986390149,
"grad_norm": 0.19343172432969366,
"kl": 0.0001125335693359375,
"learning_rate": 4.871794871794871e-07,
"loss": 0.0827,
"reward": 0.42683304101228714,
"reward_std": 0.3005821108818054,
"rewards/improved_len_reward_dast": 0.42683304101228714,
"step": 19
},
{
"completion_length": 1751.9743957519531,
"epoch": 0.05184705119896306,
"grad_norm": 0.16168658590608048,
"kl": 0.00012385845184326172,
"learning_rate": 5.128205128205127e-07,
"loss": -0.0121,
"reward": 0.2530975602567196,
"reward_std": 0.38571304827928543,
"rewards/improved_len_reward_dast": 0.2530975602567196,
"step": 20
},
{
"completion_length": 2237.551025390625,
"epoch": 0.05443940375891121,
"grad_norm": 0.1689710162473967,
"kl": 0.00014531612396240234,
"learning_rate": 5.384615384615384e-07,
"loss": 0.0204,
"reward": 0.2948920242488384,
"reward_std": 0.298846572637558,
"rewards/improved_len_reward_dast": 0.2948920242488384,
"step": 21
},
{
"completion_length": 1934.6530151367188,
"epoch": 0.057031756318859365,
"grad_norm": 0.19901186477459418,
"kl": 0.00010859966278076172,
"learning_rate": 5.641025641025641e-07,
"loss": 0.0711,
"reward": 0.3974427357316017,
"reward_std": 0.32176483422517776,
"rewards/improved_len_reward_dast": 0.3974427357316017,
"step": 22
},
{
"completion_length": 1771.6071166992188,
"epoch": 0.059624108878807515,
"grad_norm": 0.15341330162436806,
"kl": 9.715557098388672e-05,
"learning_rate": 5.897435897435898e-07,
"loss": 0.0068,
"reward": 0.5254772454500198,
"reward_std": 0.21970795094966888,
"rewards/improved_len_reward_dast": 0.5254772454500198,
"step": 23
},
{
"completion_length": 1626.7601623535156,
"epoch": 0.06221646143875567,
"grad_norm": 0.17123059985482156,
"kl": 0.00012636184692382812,
"learning_rate": 6.153846153846154e-07,
"loss": 0.0005,
"reward": 0.37653250247240067,
"reward_std": 0.3122313618659973,
"rewards/improved_len_reward_dast": 0.37653250247240067,
"step": 24
},
{
"completion_length": 2203.3570861816406,
"epoch": 0.06480881399870382,
"grad_norm": 0.14530311020475567,
"kl": 0.00015342235565185547,
"learning_rate": 6.410256410256411e-07,
"loss": 0.0001,
"reward": 0.3898318260908127,
"reward_std": 0.2564953900873661,
"rewards/improved_len_reward_dast": 0.3898318260908127,
"step": 25
},
{
"completion_length": 1814.1887664794922,
"epoch": 0.06740116655865197,
"grad_norm": 0.16879506578340142,
"kl": 0.00012159347534179688,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0144,
"reward": 0.41976478695869446,
"reward_std": 0.28917887061834335,
"rewards/improved_len_reward_dast": 0.41976478695869446,
"step": 26
},
{
"completion_length": 1796.790771484375,
"epoch": 0.06999351911860013,
"grad_norm": 0.1906922337651447,
"kl": 0.0001188516616821289,
"learning_rate": 6.923076923076922e-07,
"loss": 0.0678,
"reward": 0.4590848907828331,
"reward_std": 0.2338687926530838,
"rewards/improved_len_reward_dast": 0.4590848907828331,
"step": 27
},
{
"completion_length": 1638.7346801757812,
"epoch": 0.07258587167854828,
"grad_norm": 0.19116566970185705,
"kl": 0.00011014938354492188,
"learning_rate": 7.179487179487179e-07,
"loss": 0.0404,
"reward": 0.41585223004221916,
"reward_std": 0.2356470599770546,
"rewards/improved_len_reward_dast": 0.41585223004221916,
"step": 28
},
{
"completion_length": 1974.6121826171875,
"epoch": 0.07517822423849643,
"grad_norm": 0.16589370632623032,
"kl": 0.0001277923583984375,
"learning_rate": 7.435897435897435e-07,
"loss": -0.0039,
"reward": 0.4259794130921364,
"reward_std": 0.2807146720588207,
"rewards/improved_len_reward_dast": 0.4259794130921364,
"step": 29
},
{
"completion_length": 1697.2346496582031,
"epoch": 0.07777057679844458,
"grad_norm": 0.15079867785526088,
"kl": 9.143352508544922e-05,
"learning_rate": 7.692307692307693e-07,
"loss": -0.034,
"reward": 0.29869329556822777,
"reward_std": 0.2533705197274685,
"rewards/improved_len_reward_dast": 0.29869329556822777,
"step": 30
},
{
"completion_length": 2224.2754516601562,
"epoch": 0.08036292935839275,
"grad_norm": 0.15714058382235824,
"kl": 0.00016355514526367188,
"learning_rate": 7.948717948717948e-07,
"loss": 0.0299,
"reward": 0.5116054937243462,
"reward_std": 0.26772793754935265,
"rewards/improved_len_reward_dast": 0.5116054937243462,
"step": 31
},
{
"completion_length": 1872.5101928710938,
"epoch": 0.0829552819183409,
"grad_norm": 0.17779038577227782,
"kl": 0.00011873245239257812,
"learning_rate": 8.205128205128205e-07,
"loss": 0.041,
"reward": 0.3215858917683363,
"reward_std": 0.27255750447511673,
"rewards/improved_len_reward_dast": 0.3215858917683363,
"step": 32
},
{
"completion_length": 1810.0560913085938,
"epoch": 0.08554763447828904,
"grad_norm": 0.18730102696321338,
"kl": 0.00012505054473876953,
"learning_rate": 8.461538461538461e-07,
"loss": 0.0661,
"reward": 0.483148779720068,
"reward_std": 0.292447779327631,
"rewards/improved_len_reward_dast": 0.483148779720068,
"step": 33
},
{
"completion_length": 2044.69384765625,
"epoch": 0.0881399870382372,
"grad_norm": 0.19212723817368654,
"kl": 0.00014781951904296875,
"learning_rate": 8.717948717948718e-07,
"loss": 0.0301,
"reward": 0.3396348973037675,
"reward_std": 0.3017418198287487,
"rewards/improved_len_reward_dast": 0.3396348973037675,
"step": 34
},
{
"completion_length": 2474.2040405273438,
"epoch": 0.09073233959818536,
"grad_norm": 0.14515104577961038,
"kl": 0.0001646280288696289,
"learning_rate": 8.974358974358974e-07,
"loss": 0.0022,
"reward": 0.21619121730327606,
"reward_std": 0.25060467794537544,
"rewards/improved_len_reward_dast": 0.21619121730327606,
"step": 35
},
{
"completion_length": 2244.591827392578,
"epoch": 0.09332469215813351,
"grad_norm": 0.1551542493705291,
"kl": 0.00016069412231445312,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0396,
"reward": 0.4676624909043312,
"reward_std": 0.24170640110969543,
"rewards/improved_len_reward_dast": 0.4676624909043312,
"step": 36
},
{
"completion_length": 2032.0714111328125,
"epoch": 0.09591704471808166,
"grad_norm": 0.17617571435946433,
"kl": 0.00017404556274414062,
"learning_rate": 9.487179487179486e-07,
"loss": 0.0694,
"reward": 0.5147057101130486,
"reward_std": 0.25003863498568535,
"rewards/improved_len_reward_dast": 0.5147057101130486,
"step": 37
},
{
"completion_length": 1702.9744262695312,
"epoch": 0.0985093972780298,
"grad_norm": 0.17747466302258563,
"kl": 0.00012755393981933594,
"learning_rate": 9.743589743589742e-07,
"loss": 0.0511,
"reward": 0.5134269595146179,
"reward_std": 0.25962407886981964,
"rewards/improved_len_reward_dast": 0.5134269595146179,
"step": 38
},
{
"completion_length": 2443.3265075683594,
"epoch": 0.10110174983797797,
"grad_norm": 0.1545853869841857,
"kl": 0.00020003318786621094,
"learning_rate": 1e-06,
"loss": 0.0057,
"reward": 0.1663860222324729,
"reward_std": 0.2878994420170784,
"rewards/improved_len_reward_dast": 0.1663860222324729,
"step": 39
},
{
"completion_length": 2536.2295532226562,
"epoch": 0.10369410239792612,
"grad_norm": 0.14472339566870965,
"kl": 0.00019025802612304688,
"learning_rate": 9.99981450718918e-07,
"loss": 0.0107,
"reward": 0.337845042347908,
"reward_std": 0.2574784606695175,
"rewards/improved_len_reward_dast": 0.337845042347908,
"step": 40
},
{
"completion_length": 1802.9540405273438,
"epoch": 0.10628645495787427,
"grad_norm": 0.18393179565451373,
"kl": 0.0001709461212158203,
"learning_rate": 9.99925804404898e-07,
"loss": 0.0556,
"reward": 0.2605009600520134,
"reward_std": 0.3141431324183941,
"rewards/improved_len_reward_dast": 0.2605009600520134,
"step": 41
},
{
"completion_length": 1914.5356750488281,
"epoch": 0.10887880751782242,
"grad_norm": 0.14563287705556602,
"kl": 0.00016033649444580078,
"learning_rate": 9.998330656454915e-07,
"loss": -0.0016,
"reward": 0.506085067987442,
"reward_std": 0.28512752801179886,
"rewards/improved_len_reward_dast": 0.506085067987442,
"step": 42
},
{
"completion_length": 1949.6172790527344,
"epoch": 0.11147116007777058,
"grad_norm": 0.1923555097878006,
"kl": 0.0002301931381225586,
"learning_rate": 9.99703242086198e-07,
"loss": 0.0342,
"reward": 0.3602943029254675,
"reward_std": 0.25087232142686844,
"rewards/improved_len_reward_dast": 0.3602943029254675,
"step": 43
},
{
"completion_length": 1847.9336242675781,
"epoch": 0.11406351263771873,
"grad_norm": 0.20544935031130743,
"kl": 0.0001766681671142578,
"learning_rate": 9.995363444298333e-07,
"loss": 0.0184,
"reward": 0.3888886272907257,
"reward_std": 0.32428839057683945,
"rewards/improved_len_reward_dast": 0.3888886272907257,
"step": 44
},
{
"completion_length": 2265.096923828125,
"epoch": 0.11665586519766688,
"grad_norm": 0.14292203845841508,
"kl": 0.00018930435180664062,
"learning_rate": 9.993323864356492e-07,
"loss": 0.0017,
"reward": 0.22207820555195212,
"reward_std": 0.25323856994509697,
"rewards/improved_len_reward_dast": 0.22207820555195212,
"step": 45
},
{
"completion_length": 2902.938720703125,
"epoch": 0.11924821775761503,
"grad_norm": 0.11292903909794,
"kl": 0.00022602081298828125,
"learning_rate": 9.990913849181977e-07,
"loss": 0.009,
"reward": 0.31735342741012573,
"reward_std": 0.23353197798132896,
"rewards/improved_len_reward_dast": 0.31735342741012573,
"step": 46
},
{
"completion_length": 1847.5662689208984,
"epoch": 0.1218405703175632,
"grad_norm": 0.16221094286352689,
"kl": 0.00019168853759765625,
"learning_rate": 9.988133597459444e-07,
"loss": 0.0308,
"reward": 0.41010551154613495,
"reward_std": 0.26410190016031265,
"rewards/improved_len_reward_dast": 0.41010551154613495,
"step": 47
},
{
"completion_length": 1890.3928527832031,
"epoch": 0.12443292287751134,
"grad_norm": 0.18056689939637202,
"kl": 0.00018024444580078125,
"learning_rate": 9.984983338396323e-07,
"loss": 0.0602,
"reward": 0.41444508731365204,
"reward_std": 0.19691497087478638,
"rewards/improved_len_reward_dast": 0.41444508731365204,
"step": 48
},
{
"completion_length": 1593.5561218261719,
"epoch": 0.1270252754374595,
"grad_norm": 0.175446467485199,
"kl": 0.0002167224884033203,
"learning_rate": 9.981463331703903e-07,
"loss": 0.0348,
"reward": 0.5070051103830338,
"reward_std": 0.21338175982236862,
"rewards/improved_len_reward_dast": 0.5070051103830338,
"step": 49
},
{
"completion_length": 1878.2346496582031,
"epoch": 0.12961762799740764,
"grad_norm": 0.19397023295879293,
"kl": 0.00023925304412841797,
"learning_rate": 9.977573867575937e-07,
"loss": -0.0055,
"reward": 0.3375612124800682,
"reward_std": 0.27106014266610146,
"rewards/improved_len_reward_dast": 0.3375612124800682,
"step": 50
},
{
"completion_length": 2093.2193756103516,
"epoch": 0.1322099805573558,
"grad_norm": 0.17897729131572065,
"kl": 0.0003161430358886719,
"learning_rate": 9.9733152666647e-07,
"loss": 0.0406,
"reward": 0.391084011644125,
"reward_std": 0.2787310928106308,
"rewards/improved_len_reward_dast": 0.391084011644125,
"step": 51
},
{
"completion_length": 2547.8162841796875,
"epoch": 0.13480233311730394,
"grad_norm": 0.21312563621246147,
"kl": 0.0002579689025878906,
"learning_rate": 9.968687880054579e-07,
"loss": 0.0204,
"reward": 0.4913594201207161,
"reward_std": 0.22158093005418777,
"rewards/improved_len_reward_dast": 0.4913594201207161,
"step": 52
},
{
"completion_length": 1667.8571166992188,
"epoch": 0.1373946856772521,
"grad_norm": 0.20488856136613437,
"kl": 0.0003466606140136719,
"learning_rate": 9.963692089233104e-07,
"loss": 0.0199,
"reward": 0.33586448058485985,
"reward_std": 0.3006225787103176,
"rewards/improved_len_reward_dast": 0.33586448058485985,
"step": 53
},
{
"completion_length": 1683.8571166992188,
"epoch": 0.13998703823720027,
"grad_norm": 0.16794693148171316,
"kl": 0.0003898143768310547,
"learning_rate": 9.958328306059508e-07,
"loss": 0.0545,
"reward": 0.40445420145988464,
"reward_std": 0.2224370762705803,
"rewards/improved_len_reward_dast": 0.40445420145988464,
"step": 54
},
{
"completion_length": 1890.1173400878906,
"epoch": 0.1425793907971484,
"grad_norm": 0.23753881445921649,
"kl": 0.0004630088806152344,
"learning_rate": 9.952596972730782e-07,
"loss": 0.0328,
"reward": 0.2540663415566087,
"reward_std": 0.19413560815155506,
"rewards/improved_len_reward_dast": 0.2540663415566087,
"step": 55
},
{
"completion_length": 1981.3163146972656,
"epoch": 0.14517174335709657,
"grad_norm": 0.17990321241121518,
"kl": 0.0003933906555175781,
"learning_rate": 9.946498561745201e-07,
"loss": 0.0525,
"reward": 0.43116573989391327,
"reward_std": 0.27771833911538124,
"rewards/improved_len_reward_dast": 0.43116573989391327,
"step": 56
},
{
"completion_length": 2039.1785583496094,
"epoch": 0.14776409591704473,
"grad_norm": 0.1846330840092854,
"kl": 0.0004582405090332031,
"learning_rate": 9.94003357586339e-07,
"loss": 0.039,
"reward": 0.4446622207760811,
"reward_std": 0.27240753918886185,
"rewards/improved_len_reward_dast": 0.4446622207760811,
"step": 57
},
{
"completion_length": 2208.244842529297,
"epoch": 0.15035644847699287,
"grad_norm": 0.18237281644603112,
"kl": 0.00033092498779296875,
"learning_rate": 9.933202548066855e-07,
"loss": 0.019,
"reward": 0.40539775788784027,
"reward_std": 0.2241816557943821,
"rewards/improved_len_reward_dast": 0.40539775788784027,
"step": 58
},
{
"completion_length": 2297.8978881835938,
"epoch": 0.15294880103694103,
"grad_norm": 0.14330971261584569,
"kl": 0.00047016143798828125,
"learning_rate": 9.926006041514068e-07,
"loss": 0.0525,
"reward": 0.42655882239341736,
"reward_std": 0.31429572589695454,
"rewards/improved_len_reward_dast": 0.42655882239341736,
"step": 59
},
{
"completion_length": 2013.0254821777344,
"epoch": 0.15554115359688916,
"grad_norm": 0.17629803487682835,
"kl": 0.0005960464477539062,
"learning_rate": 9.918444649494012e-07,
"loss": 0.0608,
"reward": 0.48641955107450485,
"reward_std": 0.20150578767061234,
"rewards/improved_len_reward_dast": 0.48641955107450485,
"step": 60
},
{
"completion_length": 2207.5408325195312,
"epoch": 0.15813350615683733,
"grad_norm": 0.15781143039008477,
"kl": 0.0005693435668945312,
"learning_rate": 9.9105189953773e-07,
"loss": 0.0303,
"reward": 0.4598369002342224,
"reward_std": 0.27581632137298584,
"rewards/improved_len_reward_dast": 0.4598369002342224,
"step": 61
},
{
"completion_length": 2301.9642639160156,
"epoch": 0.1607258587167855,
"grad_norm": 0.18692743277703477,
"kl": 0.0006175041198730469,
"learning_rate": 9.90222973256475e-07,
"loss": 0.0776,
"reward": 0.5364214852452278,
"reward_std": 0.2540983334183693,
"rewards/improved_len_reward_dast": 0.5364214852452278,
"step": 62
},
{
"completion_length": 2645.9795532226562,
"epoch": 0.16331821127673363,
"grad_norm": 0.1418610635211088,
"kl": 0.0006427764892578125,
"learning_rate": 9.89357754443355e-07,
"loss": -0.0025,
"reward": 0.34263312071561813,
"reward_std": 0.18951043859124184,
"rewards/improved_len_reward_dast": 0.34263312071561813,
"step": 63
},
{
"completion_length": 2645.8213500976562,
"epoch": 0.1659105638366818,
"grad_norm": 0.153618608876278,
"kl": 0.0007190704345703125,
"learning_rate": 9.884563144280897e-07,
"loss": 0.0668,
"reward": 0.38441576063632965,
"reward_std": 0.23758746683597565,
"rewards/improved_len_reward_dast": 0.38441576063632965,
"step": 64
},
{
"completion_length": 1942.7499389648438,
"epoch": 0.16850291639662995,
"grad_norm": 0.1677620586294242,
"kl": 0.0009021759033203125,
"learning_rate": 9.875187275265198e-07,
"loss": 0.0045,
"reward": 0.463797003030777,
"reward_std": 0.22495094686746597,
"rewards/improved_len_reward_dast": 0.463797003030777,
"step": 65
},
{
"completion_length": 2529.637664794922,
"epoch": 0.1710952689565781,
"grad_norm": 0.16610955001052316,
"kl": 0.001155853271484375,
"learning_rate": 9.865450710344807e-07,
"loss": 0.0273,
"reward": 0.28665875643491745,
"reward_std": 0.2632176913321018,
"rewards/improved_len_reward_dast": 0.28665875643491745,
"step": 66
},
{
"completion_length": 2206.5152587890625,
"epoch": 0.17368762151652625,
"grad_norm": 0.1800502801294064,
"kl": 0.0015268325805664062,
"learning_rate": 9.855354252214307e-07,
"loss": 0.0485,
"reward": 0.3745214883238077,
"reward_std": 0.2947724014520645,
"rewards/improved_len_reward_dast": 0.3745214883238077,
"step": 67
},
{
"completion_length": 1694.0305480957031,
"epoch": 0.1762799740764744,
"grad_norm": 0.17423141164815986,
"kl": 0.0011110305786132812,
"learning_rate": 9.844898733238311e-07,
"loss": 0.0307,
"reward": 0.5885476693511009,
"reward_std": 0.23455755040049553,
"rewards/improved_len_reward_dast": 0.5885476693511009,
"step": 68
},
{
"completion_length": 2007.4744873046875,
"epoch": 0.17887232663642255,
"grad_norm": 0.15896758825580698,
"kl": 0.001338958740234375,
"learning_rate": 9.83408501538287e-07,
"loss": 0.0007,
"reward": 0.31307457387447357,
"reward_std": 0.28911132737994194,
"rewards/improved_len_reward_dast": 0.31307457387447357,
"step": 69
},
{
"completion_length": 1773.3979187011719,
"epoch": 0.18146467919637072,
"grad_norm": 0.1796062383107639,
"kl": 0.0017347335815429688,
"learning_rate": 9.822913990144387e-07,
"loss": -0.0399,
"reward": 0.37001069262623787,
"reward_std": 0.28749338537454605,
"rewards/improved_len_reward_dast": 0.37001069262623787,
"step": 70
},
{
"completion_length": 2173.596893310547,
"epoch": 0.18405703175631885,
"grad_norm": 0.16114217103117812,
"kl": 0.0013170242309570312,
"learning_rate": 9.811386578476146e-07,
"loss": 0.0026,
"reward": 0.43062953650951385,
"reward_std": 0.2803415507078171,
"rewards/improved_len_reward_dast": 0.43062953650951385,
"step": 71
},
{
"completion_length": 1955.3214111328125,
"epoch": 0.18664938431626701,
"grad_norm": 0.19621793042471328,
"kl": 0.0011892318725585938,
"learning_rate": 9.79950373071236e-07,
"loss": 0.0391,
"reward": 0.4167153127491474,
"reward_std": 0.2199762761592865,
"rewards/improved_len_reward_dast": 0.4167153127491474,
"step": 72
},
{
"completion_length": 1949.1836242675781,
"epoch": 0.18924173687621518,
"grad_norm": 0.22914789827556842,
"kl": 0.0016918182373046875,
"learning_rate": 9.787266426489845e-07,
"loss": 0.0899,
"reward": 0.4319685846567154,
"reward_std": 0.22768162935972214,
"rewards/improved_len_reward_dast": 0.4319685846567154,
"step": 73
},
{
"completion_length": 2204.3724365234375,
"epoch": 0.1918340894361633,
"grad_norm": 0.1817187740768752,
"kl": 0.0019664764404296875,
"learning_rate": 9.77467567466725e-07,
"loss": -0.0129,
"reward": 0.32414303719997406,
"reward_std": 0.31503428146243095,
"rewards/improved_len_reward_dast": 0.32414303719997406,
"step": 74
},
{
"completion_length": 2346.0152282714844,
"epoch": 0.19442644199611148,
"grad_norm": 0.16770893885689953,
"kl": 0.0021991729736328125,
"learning_rate": 9.761732513241882e-07,
"loss": 0.0258,
"reward": 0.42492585629224777,
"reward_std": 0.22089344635605812,
"rewards/improved_len_reward_dast": 0.42492585629224777,
"step": 75
},
{
"completion_length": 2032.0764465332031,
"epoch": 0.1970187945560596,
"grad_norm": 0.22921042830992452,
"kl": 0.00289154052734375,
"learning_rate": 9.748438009264142e-07,
"loss": 0.0577,
"reward": 0.5482478961348534,
"reward_std": 0.23817146569490433,
"rewards/improved_len_reward_dast": 0.5482478961348534,
"step": 76
},
{
"completion_length": 2265.1071166992188,
"epoch": 0.19961114711600778,
"grad_norm": 0.16354175184269162,
"kl": 0.002227783203125,
"learning_rate": 9.734793258749538e-07,
"loss": 0.0147,
"reward": 0.4374894965440035,
"reward_std": 0.20623359642922878,
"rewards/improved_len_reward_dast": 0.4374894965440035,
"step": 77
},
{
"completion_length": 2333.576446533203,
"epoch": 0.20220349967595594,
"grad_norm": 0.16254218312259427,
"kl": 0.002429962158203125,
"learning_rate": 9.720799386588358e-07,
"loss": -0.0152,
"reward": 0.3343161977827549,
"reward_std": 0.3130199611186981,
"rewards/improved_len_reward_dast": 0.3343161977827549,
"step": 78
},
{
"completion_length": 2167.6376953125,
"epoch": 0.20479585223590407,
"grad_norm": 0.16592757190255583,
"kl": 0.003253936767578125,
"learning_rate": 9.706457546452898e-07,
"loss": -0.0054,
"reward": 0.33338295854628086,
"reward_std": 0.19700353033840656,
"rewards/improved_len_reward_dast": 0.33338295854628086,
"step": 79
},
{
"completion_length": 2714.066192626953,
"epoch": 0.20738820479585224,
"grad_norm": 0.1643380510367972,
"kl": 0.00283050537109375,
"learning_rate": 9.691768920702379e-07,
"loss": 0.0534,
"reward": 0.27776505425572395,
"reward_std": 0.2255413942039013,
"rewards/improved_len_reward_dast": 0.27776505425572395,
"step": 80
},
{
"completion_length": 2177.2754516601562,
"epoch": 0.2099805573558004,
"grad_norm": 0.17174078043546434,
"kl": 0.002231597900390625,
"learning_rate": 9.676734720285456e-07,
"loss": 0.0014,
"reward": 0.3496297672390938,
"reward_std": 0.2598918229341507,
"rewards/improved_len_reward_dast": 0.3496297672390938,
"step": 81
},
{
"completion_length": 2181.4642639160156,
"epoch": 0.21257290991574854,
"grad_norm": 0.1541322467727904,
"kl": 0.0022487640380859375,
"learning_rate": 9.661356184640394e-07,
"loss": 0.0224,
"reward": 0.5033976063132286,
"reward_std": 0.22605930641293526,
"rewards/improved_len_reward_dast": 0.5033976063132286,
"step": 82
},
{
"completion_length": 2148.6376953125,
"epoch": 0.2151652624756967,
"grad_norm": 0.24078433672821112,
"kl": 0.003170013427734375,
"learning_rate": 9.64563458159288e-07,
"loss": 0.075,
"reward": 0.4664422944188118,
"reward_std": 0.3008403405547142,
"rewards/improved_len_reward_dast": 0.4664422944188118,
"step": 83
},
{
"completion_length": 2446.83154296875,
"epoch": 0.21775761503564484,
"grad_norm": 0.23474448660395109,
"kl": 0.002742767333984375,
"learning_rate": 9.629571207251515e-07,
"loss": 0.0716,
"reward": 0.4225612059235573,
"reward_std": 0.24569111317396164,
"rewards/improved_len_reward_dast": 0.4225612059235573,
"step": 84
},
{
"completion_length": 2261.2294921875,
"epoch": 0.220349967595593,
"grad_norm": 0.18550775711299053,
"kl": 0.00246429443359375,
"learning_rate": 9.613167385900944e-07,
"loss": -0.0098,
"reward": 0.3237818730995059,
"reward_std": 0.23490814864635468,
"rewards/improved_len_reward_dast": 0.3237818730995059,
"step": 85
},
{
"completion_length": 2063.73974609375,
"epoch": 0.22294232015554116,
"grad_norm": 0.1851730490973125,
"kl": 0.002269744873046875,
"learning_rate": 9.59642446989269e-07,
"loss": -0.0064,
"reward": 0.42184900864958763,
"reward_std": 0.30175674334168434,
"rewards/improved_len_reward_dast": 0.42184900864958763,
"step": 86
},
{
"completion_length": 2311.551025390625,
"epoch": 0.2255346727154893,
"grad_norm": 0.21075878937848122,
"kl": 0.003543853759765625,
"learning_rate": 9.579343839533668e-07,
"loss": 0.0396,
"reward": 0.420402854681015,
"reward_std": 0.25258706137537956,
"rewards/improved_len_reward_dast": 0.420402854681015,
"step": 87
},
{
"completion_length": 2205.050994873047,
"epoch": 0.22812702527543746,
"grad_norm": 0.17968161341123623,
"kl": 0.0030517578125,
"learning_rate": 9.561926902972378e-07,
"loss": 0.0301,
"reward": 0.5036411881446838,
"reward_std": 0.24565044790506363,
"rewards/improved_len_reward_dast": 0.5036411881446838,
"step": 88
},
{
"completion_length": 1845.0509338378906,
"epoch": 0.23071937783538563,
"grad_norm": 0.2117750181509081,
"kl": 0.00276947021484375,
"learning_rate": 9.544175096082838e-07,
"loss": 0.0239,
"reward": 0.5442821085453033,
"reward_std": 0.2692565321922302,
"rewards/improved_len_reward_dast": 0.5442821085453033,
"step": 89
},
{
"completion_length": 1978.9234313964844,
"epoch": 0.23331173039533376,
"grad_norm": 0.17014312125843986,
"kl": 0.00310516357421875,
"learning_rate": 9.526089882346172e-07,
"loss": 0.0441,
"reward": 0.45555737614631653,
"reward_std": 0.2615541107952595,
"rewards/improved_len_reward_dast": 0.45555737614631653,
"step": 90
},
{
"completion_length": 2018.1785583496094,
"epoch": 0.23590408295528192,
"grad_norm": 0.18461058936642788,
"kl": 0.002346038818359375,
"learning_rate": 9.507672752730001e-07,
"loss": 0.036,
"reward": 0.2985446793027222,
"reward_std": 0.29198911786079407,
"rewards/improved_len_reward_dast": 0.2985446793027222,
"step": 91
},
{
"completion_length": 2159.846954345703,
"epoch": 0.23849643551523006,
"grad_norm": 0.18210923909541535,
"kl": 0.0027294158935546875,
"learning_rate": 9.4889252255655e-07,
"loss": 0.0328,
"reward": 0.41519875079393387,
"reward_std": 0.2224278450012207,
"rewards/improved_len_reward_dast": 0.41519875079393387,
"step": 92
},
{
"completion_length": 1984.6275024414062,
"epoch": 0.24108878807517822,
"grad_norm": 0.2141749380576668,
"kl": 0.00240325927734375,
"learning_rate": 9.469848846422223e-07,
"loss": -0.0175,
"reward": 0.26735120080411434,
"reward_std": 0.2445412389934063,
"rewards/improved_len_reward_dast": 0.26735120080411434,
"step": 93
},
{
"completion_length": 2195.3775329589844,
"epoch": 0.2436811406351264,
"grad_norm": 0.1762045925262696,
"kl": 0.00284576416015625,
"learning_rate": 9.450445187980699e-07,
"loss": -0.0218,
"reward": 0.33138592168688774,
"reward_std": 0.3225051313638687,
"rewards/improved_len_reward_dast": 0.33138592168688774,
"step": 94
},
{
"completion_length": 2310.7040100097656,
"epoch": 0.24627349319507452,
"grad_norm": 0.15765048447372448,
"kl": 0.00351715087890625,
"learning_rate": 9.430715849902774e-07,
"loss": 0.0401,
"reward": 0.43523500859737396,
"reward_std": 0.2462395802140236,
"rewards/improved_len_reward_dast": 0.43523500859737396,
"step": 95
},
{
"completion_length": 1937.6631774902344,
"epoch": 0.24886584575502269,
"grad_norm": 0.20879458110841193,
"kl": 0.00296783447265625,
"learning_rate": 9.410662458699723e-07,
"loss": 0.0195,
"reward": 0.46912187337875366,
"reward_std": 0.2178829275071621,
"rewards/improved_len_reward_dast": 0.46912187337875366,
"step": 96
},
{
"completion_length": 1418.3111877441406,
"epoch": 0.25145819831497085,
"grad_norm": 0.19578242445789368,
"kl": 0.002971649169921875,
"learning_rate": 9.390286667598169e-07,
"loss": 0.0235,
"reward": 0.4482320174574852,
"reward_std": 0.26671652123332024,
"rewards/improved_len_reward_dast": 0.4482320174574852,
"step": 97
},
{
"completion_length": 1816.6275024414062,
"epoch": 0.254050550874919,
"grad_norm": 0.22258502785393502,
"kl": 0.002674102783203125,
"learning_rate": 9.369590156403784e-07,
"loss": 0.0319,
"reward": 0.4338858500123024,
"reward_std": 0.26093247532844543,
"rewards/improved_len_reward_dast": 0.4338858500123024,
"step": 98
},
{
"completion_length": 1874.5663146972656,
"epoch": 0.2566429034348671,
"grad_norm": 0.21715573777250466,
"kl": 0.00341796875,
"learning_rate": 9.348574631362808e-07,
"loss": 0.0524,
"reward": 0.49130718410015106,
"reward_std": 0.2376222312450409,
"rewards/improved_len_reward_dast": 0.49130718410015106,
"step": 99
},
{
"completion_length": 1574.3213806152344,
"epoch": 0.2592352559948153,
"grad_norm": 0.17861488086020622,
"kl": 0.002559661865234375,
"learning_rate": 9.327241825021379e-07,
"loss": 0.0113,
"reward": 0.6482488512992859,
"reward_std": 0.18520537950098515,
"rewards/improved_len_reward_dast": 0.6482488512992859,
"step": 100
},
{
"completion_length": 2049.6734619140625,
"epoch": 0.26182760855476345,
"grad_norm": 0.18885164201528534,
"kl": 0.003215789794921875,
"learning_rate": 9.3055934960827e-07,
"loss": 0.0223,
"reward": 0.4320410490036011,
"reward_std": 0.2535330019891262,
"rewards/improved_len_reward_dast": 0.4320410490036011,
"step": 101
},
{
"completion_length": 1969.6479187011719,
"epoch": 0.2644199611147116,
"grad_norm": 0.18844502513210584,
"kl": 0.003604888916015625,
"learning_rate": 9.283631429262053e-07,
"loss": 0.0465,
"reward": 0.4825830012559891,
"reward_std": 0.2706633023917675,
"rewards/improved_len_reward_dast": 0.4825830012559891,
"step": 102
},
{
"completion_length": 1973.2142333984375,
"epoch": 0.2670123136746598,
"grad_norm": 0.17577302935565667,
"kl": 0.00371551513671875,
"learning_rate": 9.261357435139665e-07,
"loss": -0.0154,
"reward": 0.2741311024874449,
"reward_std": 0.2971944361925125,
"rewards/improved_len_reward_dast": 0.2741311024874449,
"step": 103
},
{
"completion_length": 1952.841796875,
"epoch": 0.2696046662346079,
"grad_norm": 0.20674937772252322,
"kl": 0.0036773681640625,
"learning_rate": 9.238773350011437e-07,
"loss": 0.0516,
"reward": 0.5049019902944565,
"reward_std": 0.23485657200217247,
"rewards/improved_len_reward_dast": 0.5049019902944565,
"step": 104
},
{
"completion_length": 1959.0458679199219,
"epoch": 0.27219701879455604,
"grad_norm": 0.17037566789121353,
"kl": 0.003337860107421875,
"learning_rate": 9.215881035737557e-07,
"loss": 0.0019,
"reward": 0.5687128752470016,
"reward_std": 0.26400860771536827,
"rewards/improved_len_reward_dast": 0.5687128752470016,
"step": 105
},
{
"completion_length": 2062.1377563476562,
"epoch": 0.2747893713545042,
"grad_norm": 0.18451392997367894,
"kl": 0.003459930419921875,
"learning_rate": 9.192682379589017e-07,
"loss": 0.0431,
"reward": 0.5168322995305061,
"reward_std": 0.3160136565566063,
"rewards/improved_len_reward_dast": 0.5168322995305061,
"step": 106
},
{
"completion_length": 2263.1478576660156,
"epoch": 0.27738172391445237,
"grad_norm": 0.14735079239054202,
"kl": 0.0033721923828125,
"learning_rate": 9.169179294092006e-07,
"loss": 0.0215,
"reward": 0.4763122648000717,
"reward_std": 0.2259252481162548,
"rewards/improved_len_reward_dast": 0.4763122648000717,
"step": 107
},
{
"completion_length": 1663.2958679199219,
"epoch": 0.27997407647440054,
"grad_norm": 0.18037575865019131,
"kl": 0.003185272216796875,
"learning_rate": 9.145373716870257e-07,
"loss": -0.0085,
"reward": 0.4951760992407799,
"reward_std": 0.2437908910214901,
"rewards/improved_len_reward_dast": 0.4951760992407799,
"step": 108
},
{
"completion_length": 2391.9795837402344,
"epoch": 0.2825664290343487,
"grad_norm": 0.19641500331835074,
"kl": 0.00460052490234375,
"learning_rate": 9.121267610485294e-07,
"loss": 0.0161,
"reward": 0.3704775348305702,
"reward_std": 0.26634273678064346,
"rewards/improved_len_reward_dast": 0.3704775348305702,
"step": 109
},
{
"completion_length": 1835.2346801757812,
"epoch": 0.2851587815942968,
"grad_norm": 0.16477117090344476,
"kl": 0.00339508056640625,
"learning_rate": 9.096862962274642e-07,
"loss": -0.0073,
"reward": 0.4292480945587158,
"reward_std": 0.27501288428902626,
"rewards/improved_len_reward_dast": 0.4292480945587158,
"step": 110
},
{
"completion_length": 2104.132568359375,
"epoch": 0.28775113415424497,
"grad_norm": 0.19390179121770934,
"kl": 0.004150390625,
"learning_rate": 9.072161784187988e-07,
"loss": 0.0098,
"reward": 0.3341917209327221,
"reward_std": 0.30071910470724106,
"rewards/improved_len_reward_dast": 0.3341917209327221,
"step": 111
},
{
"completion_length": 1651.9744262695312,
"epoch": 0.29034348671419313,
"grad_norm": 0.17112243727895823,
"kl": 0.003452301025390625,
"learning_rate": 9.047166112621312e-07,
"loss": 0.0251,
"reward": 0.5470812171697617,
"reward_std": 0.21900975704193115,
"rewards/improved_len_reward_dast": 0.5470812171697617,
"step": 112
},
{
"completion_length": 1929.290771484375,
"epoch": 0.2929358392741413,
"grad_norm": 0.15871059979904864,
"kl": 0.004146575927734375,
"learning_rate": 9.021878008249001e-07,
"loss": -0.0091,
"reward": 0.4469901919364929,
"reward_std": 0.3005082905292511,
"rewards/improved_len_reward_dast": 0.4469901919364929,
"step": 113
},
{
"completion_length": 1670.8672790527344,
"epoch": 0.29552819183408946,
"grad_norm": 0.20014729950778795,
"kl": 0.00397491455078125,
"learning_rate": 8.996299555853973e-07,
"loss": 0.0048,
"reward": 0.4352063946425915,
"reward_std": 0.30053506791591644,
"rewards/improved_len_reward_dast": 0.4352063946425915,
"step": 114
},
{
"completion_length": 1748.6427917480469,
"epoch": 0.29812054439403757,
"grad_norm": 0.19064081554515172,
"kl": 0.003910064697265625,
"learning_rate": 8.970432864155798e-07,
"loss": 0.0279,
"reward": 0.4250146150588989,
"reward_std": 0.28123610466718674,
"rewards/improved_len_reward_dast": 0.4250146150588989,
"step": 115
},
{
"completion_length": 1927.2347106933594,
"epoch": 0.30071289695398573,
"grad_norm": 0.19497710947535807,
"kl": 0.00446319580078125,
"learning_rate": 8.944280065636851e-07,
"loss": -0.0085,
"reward": 0.48724526911973953,
"reward_std": 0.25488007813692093,
"rewards/improved_len_reward_dast": 0.48724526911973953,
"step": 116
},
{
"completion_length": 1818.4541015625,
"epoch": 0.3033052495139339,
"grad_norm": 0.34088335134181214,
"kl": 0.004119873046875,
"learning_rate": 8.917843316366515e-07,
"loss": -0.0105,
"reward": 0.4738898351788521,
"reward_std": 0.23307713866233826,
"rewards/improved_len_reward_dast": 0.4738898351788521,
"step": 117
},
{
"completion_length": 2024.6478881835938,
"epoch": 0.30589760207388206,
"grad_norm": 0.21372750051520226,
"kl": 0.005035400390625,
"learning_rate": 8.891124795823426e-07,
"loss": -0.0181,
"reward": 0.3491591773927212,
"reward_std": 0.23684632405638695,
"rewards/improved_len_reward_dast": 0.3491591773927212,
"step": 118
},
{
"completion_length": 1981.9183349609375,
"epoch": 0.3084899546338302,
"grad_norm": 0.16944043573052223,
"kl": 0.00438690185546875,
"learning_rate": 8.864126706715796e-07,
"loss": 0.0496,
"reward": 0.5126907303929329,
"reward_std": 0.186638955026865,
"rewards/improved_len_reward_dast": 0.5126907303929329,
"step": 119
},
{
"completion_length": 2106.7754821777344,
"epoch": 0.31108230719377833,
"grad_norm": 0.17722812898313278,
"kl": 0.0058135986328125,
"learning_rate": 8.83685127479982e-07,
"loss": 0.0527,
"reward": 0.48729025572538376,
"reward_std": 0.1900501400232315,
"rewards/improved_len_reward_dast": 0.48729025572538376,
"step": 120
},
{
"completion_length": 1759.3673095703125,
"epoch": 0.3136746597537265,
"grad_norm": 0.18128869032474573,
"kl": 0.0042724609375,
"learning_rate": 8.809300748696173e-07,
"loss": 0.0305,
"reward": 0.45106371864676476,
"reward_std": 0.2715425603091717,
"rewards/improved_len_reward_dast": 0.45106371864676476,
"step": 121
},
{
"completion_length": 2413.795867919922,
"epoch": 0.31626701231367466,
"grad_norm": 0.18347422721089623,
"kl": 0.00518035888671875,
"learning_rate": 8.781477399704652e-07,
"loss": 0.0274,
"reward": 0.3711659908294678,
"reward_std": 0.2643970772624016,
"rewards/improved_len_reward_dast": 0.3711659908294678,
"step": 122
},
{
"completion_length": 2341.85205078125,
"epoch": 0.3188593648736228,
"grad_norm": 0.16392258093452686,
"kl": 0.00547027587890625,
"learning_rate": 8.753383521616902e-07,
"loss": -0.0124,
"reward": 0.4358869791030884,
"reward_std": 0.25290554389357567,
"rewards/improved_len_reward_dast": 0.4358869791030884,
"step": 123
},
{
"completion_length": 1800.4285278320312,
"epoch": 0.321451717433571,
"grad_norm": 0.19958740910637524,
"kl": 0.004329681396484375,
"learning_rate": 8.72502143052733e-07,
"loss": -0.032,
"reward": 0.38679035007953644,
"reward_std": 0.24552029743790627,
"rewards/improved_len_reward_dast": 0.38679035007953644,
"step": 124
},
{
"completion_length": 1991.841796875,
"epoch": 0.32404406999351915,
"grad_norm": 0.1577890471913444,
"kl": 0.004730224609375,
"learning_rate": 8.696393464642158e-07,
"loss": 0.0002,
"reward": 0.5458214432001114,
"reward_std": 0.23444852605462074,
"rewards/improved_len_reward_dast": 0.5458214432001114,
"step": 125
},
{
"completion_length": 1776.3826293945312,
"epoch": 0.32663642255346725,
"grad_norm": 0.17200589761505877,
"kl": 0.004123687744140625,
"learning_rate": 8.667501984086655e-07,
"loss": 0.0051,
"reward": 0.5649043023586273,
"reward_std": 0.22592130675911903,
"rewards/improved_len_reward_dast": 0.5649043023586273,
"step": 126
},
{
"completion_length": 1802.2703857421875,
"epoch": 0.3292287751134154,
"grad_norm": 0.17760415544496286,
"kl": 0.00440216064453125,
"learning_rate": 8.638349370710573e-07,
"loss": 0.0208,
"reward": 0.4263976775109768,
"reward_std": 0.23108776286244392,
"rewards/improved_len_reward_dast": 0.4263976775109768,
"step": 127
},
{
"completion_length": 1714.5101623535156,
"epoch": 0.3318211276733636,
"grad_norm": 0.19240596915490546,
"kl": 0.004238128662109375,
"learning_rate": 8.608938027891775e-07,
"loss": -0.0168,
"reward": 0.5219497531652451,
"reward_std": 0.28100451827049255,
"rewards/improved_len_reward_dast": 0.5219497531652451,
"step": 128
},
{
"completion_length": 1703.540771484375,
"epoch": 0.33441348023331174,
"grad_norm": 0.1869791882546724,
"kl": 0.004779815673828125,
"learning_rate": 8.579270380338107e-07,
"loss": 0.0213,
"reward": 0.5599559545516968,
"reward_std": 0.20541859790682793,
"rewards/improved_len_reward_dast": 0.5599559545516968,
"step": 129
},
{
"completion_length": 2307.8009338378906,
"epoch": 0.3370058327932599,
"grad_norm": 0.16311538988092006,
"kl": 0.005146026611328125,
"learning_rate": 8.549348873887496e-07,
"loss": 0.0135,
"reward": 0.3861619606614113,
"reward_std": 0.2727292329072952,
"rewards/improved_len_reward_dast": 0.3861619606614113,
"step": 130
},
{
"completion_length": 1866.1173095703125,
"epoch": 0.339598185353208,
"grad_norm": 0.2056998520073749,
"kl": 0.003955841064453125,
"learning_rate": 8.519175975306312e-07,
"loss": 0.0417,
"reward": 0.24037051759660244,
"reward_std": 0.3439597971737385,
"rewards/improved_len_reward_dast": 0.24037051759660244,
"step": 131
},
{
"completion_length": 1911.8265075683594,
"epoch": 0.3421905379131562,
"grad_norm": 0.2317203557971351,
"kl": 0.004306793212890625,
"learning_rate": 8.48875417208601e-07,
"loss": 0.0452,
"reward": 0.47868431359529495,
"reward_std": 0.23706890270113945,
"rewards/improved_len_reward_dast": 0.47868431359529495,
"step": 132
},
{
"completion_length": 1986.4744567871094,
"epoch": 0.34478289047310434,
"grad_norm": 0.17738281540623924,
"kl": 0.00514984130859375,
"learning_rate": 8.458085972238048e-07,
"loss": 0.0072,
"reward": 0.28852372616529465,
"reward_std": 0.2508459724485874,
"rewards/improved_len_reward_dast": 0.28852372616529465,
"step": 133
},
{
"completion_length": 1994.0815734863281,
"epoch": 0.3473752430330525,
"grad_norm": 0.1997705536336717,
"kl": 0.00603485107421875,
"learning_rate": 8.427173904087138e-07,
"loss": -0.0078,
"reward": 0.36212442070245743,
"reward_std": 0.2850674279034138,
"rewards/improved_len_reward_dast": 0.36212442070245743,
"step": 134
},
{
"completion_length": 2514.382598876953,
"epoch": 0.34996759559300067,
"grad_norm": 0.15208989698192185,
"kl": 0.00661468505859375,
"learning_rate": 8.396020516062794e-07,
"loss": 0.0032,
"reward": 0.4105417560786009,
"reward_std": 0.22361259534955025,
"rewards/improved_len_reward_dast": 0.4105417560786009,
"step": 135
},
{
"completion_length": 1611.4693908691406,
"epoch": 0.3525599481529488,
"grad_norm": 0.19115046617358702,
"kl": 0.00412750244140625,
"learning_rate": 8.364628376489242e-07,
"loss": 0.0441,
"reward": 0.5174260064959526,
"reward_std": 0.2693428471684456,
"rewards/improved_len_reward_dast": 0.5174260064959526,
"step": 136
},
{
"completion_length": 1408.8061218261719,
"epoch": 0.35515230071289694,
"grad_norm": 0.1889385097712907,
"kl": 0.00432586669921875,
"learning_rate": 8.333000073373685e-07,
"loss": -0.0013,
"reward": 0.5440054759383202,
"reward_std": 0.19834023714065552,
"rewards/improved_len_reward_dast": 0.5440054759383202,
"step": 137
},
{
"completion_length": 1900.6836242675781,
"epoch": 0.3577446532728451,
"grad_norm": 0.1681565313739815,
"kl": 0.00485992431640625,
"learning_rate": 8.301138214192945e-07,
"loss": -0.0184,
"reward": 0.49239661544561386,
"reward_std": 0.29014211893081665,
"rewards/improved_len_reward_dast": 0.49239661544561386,
"step": 138
},
{
"completion_length": 1999.6376953125,
"epoch": 0.36033700583279327,
"grad_norm": 0.15882776364135623,
"kl": 0.00577545166015625,
"learning_rate": 8.269045425678497e-07,
"loss": 0.0007,
"reward": 0.49145303666591644,
"reward_std": 0.23764513432979584,
"rewards/improved_len_reward_dast": 0.49145303666591644,
"step": 139
},
{
"completion_length": 1802.9540405273438,
"epoch": 0.36292935839274143,
"grad_norm": 0.17674925264442287,
"kl": 0.00507354736328125,
"learning_rate": 8.236724353599918e-07,
"loss": 0.0455,
"reward": 0.5860550999641418,
"reward_std": 0.27273107320070267,
"rewards/improved_len_reward_dast": 0.5860550999641418,
"step": 140
},
{
"completion_length": 1764.2857360839844,
"epoch": 0.36552171095268954,
"grad_norm": 0.20452351841611824,
"kl": 0.005157470703125,
"learning_rate": 8.204177662546763e-07,
"loss": 0.0623,
"reward": 0.5012878403067589,
"reward_std": 0.24189992249011993,
"rewards/improved_len_reward_dast": 0.5012878403067589,
"step": 141
},
{
"completion_length": 1690.7754516601562,
"epoch": 0.3681140635126377,
"grad_norm": 0.20408361176282866,
"kl": 0.005096435546875,
"learning_rate": 8.171408035708906e-07,
"loss": 0.0395,
"reward": 0.5669431537389755,
"reward_std": 0.21233107149600983,
"rewards/improved_len_reward_dast": 0.5669431537389755,
"step": 142
},
{
"completion_length": 1466.1887512207031,
"epoch": 0.37070641607258586,
"grad_norm": 0.18540115151616593,
"kl": 0.004268646240234375,
"learning_rate": 8.138418174655323e-07,
"loss": 0.0175,
"reward": 0.5513587966561317,
"reward_std": 0.23814994096755981,
"rewards/improved_len_reward_dast": 0.5513587966561317,
"step": 143
},
{
"completion_length": 2337.9540405273438,
"epoch": 0.37329876863253403,
"grad_norm": 0.16911682150078117,
"kl": 0.00580596923828125,
"learning_rate": 8.105210799111366e-07,
"loss": 0.0186,
"reward": 0.4387804791331291,
"reward_std": 0.22327708080410957,
"rewards/improved_len_reward_dast": 0.4387804791331291,
"step": 144
},
{
"completion_length": 1876.8162536621094,
"epoch": 0.3758911211924822,
"grad_norm": 0.1658748038113864,
"kl": 0.005420684814453125,
"learning_rate": 8.071788646734564e-07,
"loss": -0.0219,
"reward": 0.49322987347841263,
"reward_std": 0.17780348286032677,
"rewards/improved_len_reward_dast": 0.49322987347841263,
"step": 145
},
{
"completion_length": 1766.6683654785156,
"epoch": 0.37848347375243035,
"grad_norm": 0.15094962051933156,
"kl": 0.00490570068359375,
"learning_rate": 8.038154472888909e-07,
"loss": -0.0071,
"reward": 0.5451386570930481,
"reward_std": 0.24280473217368126,
"rewards/improved_len_reward_dast": 0.5451386570930481,
"step": 146
},
{
"completion_length": 1868.7346801757812,
"epoch": 0.38107582631237846,
"grad_norm": 0.16172171063714397,
"kl": 0.00629425048828125,
"learning_rate": 8.004311050417711e-07,
"loss": -0.0304,
"reward": 0.5152218118309975,
"reward_std": 0.2387254200875759,
"rewards/improved_len_reward_dast": 0.5152218118309975,
"step": 147
},
{
"completion_length": 1790.6223754882812,
"epoch": 0.3836681788723266,
"grad_norm": 0.1743302771283738,
"kl": 0.005558013916015625,
"learning_rate": 7.970261169414999e-07,
"loss": 0.0191,
"reward": 0.48244544118642807,
"reward_std": 0.22489535436034203,
"rewards/improved_len_reward_dast": 0.48244544118642807,
"step": 148
},
{
"completion_length": 2116.790771484375,
"epoch": 0.3862605314322748,
"grad_norm": 0.17221394034022652,
"kl": 0.00658416748046875,
"learning_rate": 7.936007636995497e-07,
"loss": 0.0313,
"reward": 0.47787267714738846,
"reward_std": 0.2161446176469326,
"rewards/improved_len_reward_dast": 0.47787267714738846,
"step": 149
},
{
"completion_length": 1756.3367309570312,
"epoch": 0.38885288399222295,
"grad_norm": 0.21564957617935493,
"kl": 0.005767822265625,
"learning_rate": 7.901553277063213e-07,
"loss": 0.0672,
"reward": 0.3681907616555691,
"reward_std": 0.2732224613428116,
"rewards/improved_len_reward_dast": 0.3681907616555691,
"step": 150
},
{
"completion_length": 2132.2958984375,
"epoch": 0.3914452365521711,
"grad_norm": 0.21594042558519996,
"kl": 0.00696563720703125,
"learning_rate": 7.866900930078618e-07,
"loss": 0.0453,
"reward": 0.44704771041870117,
"reward_std": 0.27593713626265526,
"rewards/improved_len_reward_dast": 0.44704771041870117,
"step": 151
},
{
"completion_length": 1927.5152282714844,
"epoch": 0.3940375891121192,
"grad_norm": 0.1610545275313625,
"kl": 0.0055389404296875,
"learning_rate": 7.832053452824489e-07,
"loss": 0.0042,
"reward": 0.5329347252845764,
"reward_std": 0.24699966236948967,
"rewards/improved_len_reward_dast": 0.5329347252845764,
"step": 152
},
{
"completion_length": 1966.3009948730469,
"epoch": 0.3966299416720674,
"grad_norm": 0.16310455930995557,
"kl": 0.00768280029296875,
"learning_rate": 7.797013718170384e-07,
"loss": -0.0202,
"reward": 0.4772297702729702,
"reward_std": 0.22153976559638977,
"rewards/improved_len_reward_dast": 0.4772297702729702,
"step": 153
},
{
"completion_length": 1658.9846954345703,
"epoch": 0.39922229423201555,
"grad_norm": 0.20975108303665252,
"kl": 0.005176544189453125,
"learning_rate": 7.761784614835801e-07,
"loss": 0.0531,
"reward": 0.44522225111722946,
"reward_std": 0.2452440857887268,
"rewards/improved_len_reward_dast": 0.44522225111722946,
"step": 154
},
{
"completion_length": 1932.2856903076172,
"epoch": 0.4018146467919637,
"grad_norm": 0.1943564114200122,
"kl": 0.0059356689453125,
"learning_rate": 7.726369047152029e-07,
"loss": -0.0237,
"reward": 0.42587200179696083,
"reward_std": 0.22648616321384907,
"rewards/improved_len_reward_dast": 0.42587200179696083,
"step": 155
},
{
"completion_length": 1716.0968933105469,
"epoch": 0.4044069993519119,
"grad_norm": 0.15181408541122274,
"kl": 0.004180908203125,
"learning_rate": 7.690769934822712e-07,
"loss": 0.0045,
"reward": 0.5171971023082733,
"reward_std": 0.2628367580473423,
"rewards/improved_len_reward_dast": 0.5171971023082733,
"step": 156
},
{
"completion_length": 1752.392822265625,
"epoch": 0.40699935191186,
"grad_norm": 0.18038381295579756,
"kl": 0.00592803955078125,
"learning_rate": 7.654990212683142e-07,
"loss": 0.0106,
"reward": 0.5561209693551064,
"reward_std": 0.2517809383571148,
"rewards/improved_len_reward_dast": 0.5561209693551064,
"step": 157
},
{
"completion_length": 1883.0203247070312,
"epoch": 0.40959170447180815,
"grad_norm": 0.1789471226238438,
"kl": 0.0065460205078125,
"learning_rate": 7.619032830458307e-07,
"loss": 0.0237,
"reward": 0.5727267265319824,
"reward_std": 0.2596842758357525,
"rewards/improved_len_reward_dast": 0.5727267265319824,
"step": 158
},
{
"completion_length": 2159.1223754882812,
"epoch": 0.4121840570317563,
"grad_norm": 0.19892754911241722,
"kl": 0.006744384765625,
"learning_rate": 7.582900752519723e-07,
"loss": 0.0183,
"reward": 0.4676053449511528,
"reward_std": 0.2507024519145489,
"rewards/improved_len_reward_dast": 0.4676053449511528,
"step": 159
},
{
"completion_length": 2184.1122131347656,
"epoch": 0.4147764095917045,
"grad_norm": 0.19564213031602237,
"kl": 0.0068359375,
"learning_rate": 7.546596957641031e-07,
"loss": 0.0021,
"reward": 0.44793111085891724,
"reward_std": 0.25819646567106247,
"rewards/improved_len_reward_dast": 0.44793111085891724,
"step": 160
},
{
"completion_length": 2137.8213500976562,
"epoch": 0.41736876215165264,
"grad_norm": 0.18907736107190778,
"kl": 0.0077056884765625,
"learning_rate": 7.510124438752432e-07,
"loss": 0.0214,
"reward": 0.49345044791698456,
"reward_std": 0.2429029531776905,
"rewards/improved_len_reward_dast": 0.49345044791698456,
"step": 161
},
{
"completion_length": 1669.6938171386719,
"epoch": 0.4199611147116008,
"grad_norm": 0.1907714193828339,
"kl": 0.0060577392578125,
"learning_rate": 7.473486202693949e-07,
"loss": 0.0435,
"reward": 0.664195254445076,
"reward_std": 0.20142245292663574,
"rewards/improved_len_reward_dast": 0.664195254445076,
"step": 162
},
{
"completion_length": 1800.2550659179688,
"epoch": 0.4225534672715489,
"grad_norm": 0.20951161548060418,
"kl": 0.00634002685546875,
"learning_rate": 7.43668526996753e-07,
"loss": 0.0197,
"reward": 0.4569202698767185,
"reward_std": 0.27463599294424057,
"rewards/improved_len_reward_dast": 0.4569202698767185,
"step": 163
},
{
"completion_length": 1960.5714111328125,
"epoch": 0.4251458198314971,
"grad_norm": 0.17463868980210742,
"kl": 0.00701141357421875,
"learning_rate": 7.399724674488046e-07,
"loss": -0.0162,
"reward": 0.4838453456759453,
"reward_std": 0.241712786257267,
"rewards/improved_len_reward_dast": 0.4838453456759453,
"step": 164
},
{
"completion_length": 1959.688735961914,
"epoch": 0.42773817239144524,
"grad_norm": 0.1780914212189235,
"kl": 0.00678253173828125,
"learning_rate": 7.36260746333316e-07,
"loss": 0.0377,
"reward": 0.4423503875732422,
"reward_std": 0.18371517956256866,
"rewards/improved_len_reward_dast": 0.4423503875732422,
"step": 165
},
{
"completion_length": 1861.6989440917969,
"epoch": 0.4303305249513934,
"grad_norm": 0.1834193572229538,
"kl": 0.0063629150390625,
"learning_rate": 7.325336696492128e-07,
"loss": 0.0199,
"reward": 0.577091321349144,
"reward_std": 0.21984241902828217,
"rewards/improved_len_reward_dast": 0.577091321349144,
"step": 166
},
{
"completion_length": 2081.637664794922,
"epoch": 0.43292287751134156,
"grad_norm": 0.16721105851518456,
"kl": 0.00749969482421875,
"learning_rate": 7.287915446613531e-07,
"loss": 0.026,
"reward": 0.45866213738918304,
"reward_std": 0.20599739998579025,
"rewards/improved_len_reward_dast": 0.45866213738918304,
"step": 167
},
{
"completion_length": 2199.9692993164062,
"epoch": 0.43551523007128967,
"grad_norm": 0.19137635279403245,
"kl": 0.00868988037109375,
"learning_rate": 7.250346798751953e-07,
"loss": 0.0397,
"reward": 0.40584760159254074,
"reward_std": 0.32696348428726196,
"rewards/improved_len_reward_dast": 0.40584760159254074,
"step": 168
},
{
"completion_length": 1805.3213958740234,
"epoch": 0.43810758263123784,
"grad_norm": 0.21092807084032023,
"kl": 0.0059967041015625,
"learning_rate": 7.212633850113662e-07,
"loss": 0.0254,
"reward": 0.4987664595246315,
"reward_std": 0.2146843560039997,
"rewards/improved_len_reward_dast": 0.4987664595246315,
"step": 169
},
{
"completion_length": 1496.5714111328125,
"epoch": 0.440699935191186,
"grad_norm": 0.209791586936155,
"kl": 0.0062408447265625,
"learning_rate": 7.174779709801253e-07,
"loss": -0.0119,
"reward": 0.4626496955752373,
"reward_std": 0.27531013265252113,
"rewards/improved_len_reward_dast": 0.4626496955752373,
"step": 170
},
{
"completion_length": 1733.1683044433594,
"epoch": 0.44329228775113416,
"grad_norm": 0.17464526469335237,
"kl": 0.00637054443359375,
"learning_rate": 7.136787498557344e-07,
"loss": 0.0115,
"reward": 0.47377418726682663,
"reward_std": 0.31614845246076584,
"rewards/improved_len_reward_dast": 0.47377418726682663,
"step": 171
},
{
"completion_length": 1834.5152893066406,
"epoch": 0.4458846403110823,
"grad_norm": 0.2156862895463155,
"kl": 0.00728607177734375,
"learning_rate": 7.098660348507293e-07,
"loss": 0.0408,
"reward": 0.4768953248858452,
"reward_std": 0.24819828569889069,
"rewards/improved_len_reward_dast": 0.4768953248858452,
"step": 172
},
{
"completion_length": 1937.4999694824219,
"epoch": 0.44847699287103043,
"grad_norm": 0.16621803638711263,
"kl": 0.00614166259765625,
"learning_rate": 7.060401402900977e-07,
"loss": -0.0075,
"reward": 0.41049597412347794,
"reward_std": 0.295925073325634,
"rewards/improved_len_reward_dast": 0.41049597412347794,
"step": 173
},
{
"completion_length": 1872.5152893066406,
"epoch": 0.4510693454309786,
"grad_norm": 0.17350342194775273,
"kl": 0.0063323974609375,
"learning_rate": 7.022013815853672e-07,
"loss": -0.0071,
"reward": 0.4474351555109024,
"reward_std": 0.2942516505718231,
"rewards/improved_len_reward_dast": 0.4474351555109024,
"step": 174
},
{
"completion_length": 1865.642822265625,
"epoch": 0.45366169799092676,
"grad_norm": 0.18659528012982252,
"kl": 0.007171630859375,
"learning_rate": 6.983500752086006e-07,
"loss": 0.0382,
"reward": 0.4795069247484207,
"reward_std": 0.25236207991838455,
"rewards/improved_len_reward_dast": 0.4795069247484207,
"step": 175
},
{
"completion_length": 1636.3724365234375,
"epoch": 0.4562540505508749,
"grad_norm": 0.23440488356464137,
"kl": 0.00577545166015625,
"learning_rate": 6.94486538666307e-07,
"loss": 0.0499,
"reward": 0.5080657936632633,
"reward_std": 0.23907579854130745,
"rewards/improved_len_reward_dast": 0.5080657936632633,
"step": 176
},
{
"completion_length": 1736.0662841796875,
"epoch": 0.4588464031108231,
"grad_norm": 0.1753900070568452,
"kl": 0.00612640380859375,
"learning_rate": 6.906110904732656e-07,
"loss": 0.0149,
"reward": 0.5958031266927719,
"reward_std": 0.21545593440532684,
"rewards/improved_len_reward_dast": 0.5958031266927719,
"step": 177
},
{
"completion_length": 2228.107147216797,
"epoch": 0.46143875567077125,
"grad_norm": 0.15717817851479735,
"kl": 0.0067901611328125,
"learning_rate": 6.867240501262666e-07,
"loss": 0.0356,
"reward": 0.4543240964412689,
"reward_std": 0.22809231281280518,
"rewards/improved_len_reward_dast": 0.4543240964412689,
"step": 178
},
{
"completion_length": 1626.4897766113281,
"epoch": 0.46403110823071936,
"grad_norm": 0.18177589779485429,
"kl": 0.00614166259765625,
"learning_rate": 6.828257380777723e-07,
"loss": -0.0089,
"reward": 0.31600036658346653,
"reward_std": 0.3029540926218033,
"rewards/improved_len_reward_dast": 0.31600036658346653,
"step": 179
},
{
"completion_length": 1958.8315734863281,
"epoch": 0.4666234607906675,
"grad_norm": 0.1755764816680188,
"kl": 0.00868988037109375,
"learning_rate": 6.789164757094978e-07,
"loss": -0.0072,
"reward": 0.49836502969264984,
"reward_std": 0.23551873490214348,
"rewards/improved_len_reward_dast": 0.49836502969264984,
"step": 180
},
{
"completion_length": 1990.5101623535156,
"epoch": 0.4692158133506157,
"grad_norm": 0.18931918239703327,
"kl": 0.00846099853515625,
"learning_rate": 6.749965853059164e-07,
"loss": 0.0002,
"reward": 0.4911562129855156,
"reward_std": 0.3188675567507744,
"rewards/improved_len_reward_dast": 0.4911562129855156,
"step": 181
},
{
"completion_length": 1658.4642028808594,
"epoch": 0.47180816591056385,
"grad_norm": 0.19209648070470173,
"kl": 0.0060577392578125,
"learning_rate": 6.710663900276903e-07,
"loss": -0.0146,
"reward": 0.43227584287524223,
"reward_std": 0.2176770530641079,
"rewards/improved_len_reward_dast": 0.43227584287524223,
"step": 182
},
{
"completion_length": 1583.1224365234375,
"epoch": 0.474400518470512,
"grad_norm": 0.2038966371816177,
"kl": 0.00626373291015625,
"learning_rate": 6.671262138850274e-07,
"loss": 0.0399,
"reward": 0.5645861923694611,
"reward_std": 0.23070186376571655,
"rewards/improved_len_reward_dast": 0.5645861923694611,
"step": 183
},
{
"completion_length": 1520.1683044433594,
"epoch": 0.4769928710304601,
"grad_norm": 0.18304431320557107,
"kl": 0.006195068359375,
"learning_rate": 6.631763817109717e-07,
"loss": 0.0255,
"reward": 0.5742315426468849,
"reward_std": 0.23619792237877846,
"rewards/improved_len_reward_dast": 0.5742315426468849,
"step": 184
},
{
"completion_length": 1594.3519897460938,
"epoch": 0.4795852235904083,
"grad_norm": 0.16992347566951083,
"kl": 0.0060272216796875,
"learning_rate": 6.592172191346218e-07,
"loss": 0.0111,
"reward": 0.5422097221016884,
"reward_std": 0.25979943573474884,
"rewards/improved_len_reward_dast": 0.5422097221016884,
"step": 185
},
{
"completion_length": 1702.4183654785156,
"epoch": 0.48217757615035645,
"grad_norm": 0.20895915866852888,
"kl": 0.00782012939453125,
"learning_rate": 6.552490525542864e-07,
"loss": -0.0109,
"reward": 0.49483248591423035,
"reward_std": 0.18612295389175415,
"rewards/improved_len_reward_dast": 0.49483248591423035,
"step": 186
},
{
"completion_length": 1660.1836547851562,
"epoch": 0.4847699287103046,
"grad_norm": 0.177095187259404,
"kl": 0.00647735595703125,
"learning_rate": 6.512722091105757e-07,
"loss": 0.0079,
"reward": 0.41740237921476364,
"reward_std": 0.3016924597322941,
"rewards/improved_len_reward_dast": 0.41740237921476364,
"step": 187
},
{
"completion_length": 1430.4846649169922,
"epoch": 0.4873622812702528,
"grad_norm": 0.1856358739213989,
"kl": 0.00666046142578125,
"learning_rate": 6.472870166594314e-07,
"loss": 0.0043,
"reward": 0.6006389036774635,
"reward_std": 0.20018238201737404,
"rewards/improved_len_reward_dast": 0.6006389036774635,
"step": 188
},
{
"completion_length": 1959.5305786132812,
"epoch": 0.4899546338302009,
"grad_norm": 0.2109811675895551,
"kl": 0.00728607177734375,
"learning_rate": 6.432938037450974e-07,
"loss": 0.0321,
"reward": 0.4208461381494999,
"reward_std": 0.25322337821125984,
"rewards/improved_len_reward_dast": 0.4208461381494999,
"step": 189
},
{
"completion_length": 1917.4489135742188,
"epoch": 0.49254698639014904,
"grad_norm": 0.17048136291778715,
"kl": 0.00824737548828125,
"learning_rate": 6.392928995730352e-07,
"loss": 0.0145,
"reward": 0.4505029022693634,
"reward_std": 0.2619488127529621,
"rewards/improved_len_reward_dast": 0.4505029022693634,
"step": 190
},
{
"completion_length": 1402.14794921875,
"epoch": 0.4951393389500972,
"grad_norm": 0.18497170613874878,
"kl": 0.00598907470703125,
"learning_rate": 6.352846339827826e-07,
"loss": -0.0062,
"reward": 0.5656020939350128,
"reward_std": 0.21923119574785233,
"rewards/improved_len_reward_dast": 0.5656020939350128,
"step": 191
},
{
"completion_length": 1465.3367004394531,
"epoch": 0.49773169151004537,
"grad_norm": 0.21257458462794812,
"kl": 0.00757598876953125,
"learning_rate": 6.312693374207627e-07,
"loss": 0.0312,
"reward": 0.5634729117155075,
"reward_std": 0.2284911945462227,
"rewards/improved_len_reward_dast": 0.5634729117155075,
"step": 192
},
{
"completion_length": 1696.1326293945312,
"epoch": 0.5003240440699935,
"grad_norm": 0.1917176386219813,
"kl": 0.00817108154296875,
"learning_rate": 6.272473409130397e-07,
"loss": -0.0122,
"reward": 0.5527424663305283,
"reward_std": 0.21831231378018856,
"rewards/improved_len_reward_dast": 0.5527424663305283,
"step": 193
},
{
"completion_length": 1484.0254821777344,
"epoch": 0.5029163966299417,
"grad_norm": 0.18489831646493407,
"kl": 0.00533294677734375,
"learning_rate": 6.232189760380301e-07,
"loss": -0.0092,
"reward": 0.5197786688804626,
"reward_std": 0.2706274203956127,
"rewards/improved_len_reward_dast": 0.5197786688804626,
"step": 194
},
{
"completion_length": 1797.8571166992188,
"epoch": 0.5055087491898899,
"grad_norm": 0.2022876784231448,
"kl": 0.0066070556640625,
"learning_rate": 6.191845748991671e-07,
"loss": 0.0034,
"reward": 0.5193638280034065,
"reward_std": 0.15613215044140816,
"rewards/improved_len_reward_dast": 0.5193638280034065,
"step": 195
},
{
"completion_length": 1814.0611877441406,
"epoch": 0.508101101749838,
"grad_norm": 0.16795830607580578,
"kl": 0.00803375244140625,
"learning_rate": 6.151444700975203e-07,
"loss": 0.0087,
"reward": 0.5806097835302353,
"reward_std": 0.21784314513206482,
"rewards/improved_len_reward_dast": 0.5806097835302353,
"step": 196
},
{
"completion_length": 2092.0663146972656,
"epoch": 0.5106934543097861,
"grad_norm": 0.18162024484286257,
"kl": 0.007904052734375,
"learning_rate": 6.110989947043767e-07,
"loss": 0.0292,
"reward": 0.3626396246254444,
"reward_std": 0.2863907441496849,
"rewards/improved_len_reward_dast": 0.3626396246254444,
"step": 197
},
{
"completion_length": 1512.7142333984375,
"epoch": 0.5132858068697342,
"grad_norm": 0.25742266689895515,
"kl": 0.0079498291015625,
"learning_rate": 6.070484822337816e-07,
"loss": 0.0789,
"reward": 0.4790092930197716,
"reward_std": 0.24650050699710846,
"rewards/improved_len_reward_dast": 0.4790092930197716,
"step": 198
},
{
"completion_length": 1804.9438171386719,
"epoch": 0.5158781594296824,
"grad_norm": 0.1817030599796311,
"kl": 0.00815582275390625,
"learning_rate": 6.029932666150431e-07,
"loss": 0.0508,
"reward": 0.45782896876335144,
"reward_std": 0.257952194660902,
"rewards/improved_len_reward_dast": 0.45782896876335144,
"step": 199
},
{
"completion_length": 1632.5561218261719,
"epoch": 0.5184705119896306,
"grad_norm": 0.1682470886081435,
"kl": 0.00696563720703125,
"learning_rate": 5.989336821652029e-07,
"loss": 0.0123,
"reward": 0.49326401203870773,
"reward_std": 0.2727998159825802,
"rewards/improved_len_reward_dast": 0.49326401203870773,
"step": 200
},
{
"completion_length": 1362.3570861816406,
"epoch": 0.5210628645495787,
"grad_norm": 0.2140704882737376,
"kl": 0.00643157958984375,
"learning_rate": 5.948700635614745e-07,
"loss": 0.0187,
"reward": 0.30808811727911234,
"reward_std": 0.31461621820926666,
"rewards/improved_len_reward_dast": 0.30808811727911234,
"step": 201
},
{
"completion_length": 1429.9183349609375,
"epoch": 0.5236552171095269,
"grad_norm": 0.19196609876831122,
"kl": 0.0064544677734375,
"learning_rate": 5.908027458136518e-07,
"loss": 0.0442,
"reward": 0.6718090772628784,
"reward_std": 0.19283609837293625,
"rewards/improved_len_reward_dast": 0.6718090772628784,
"step": 202
},
{
"completion_length": 1489.9336242675781,
"epoch": 0.5262475696694751,
"grad_norm": 0.15253854414767434,
"kl": 0.00498199462890625,
"learning_rate": 5.867320642364916e-07,
"loss": 0.013,
"reward": 0.6136546954512596,
"reward_std": 0.23966671898961067,
"rewards/improved_len_reward_dast": 0.6136546954512596,
"step": 203
},
{
"completion_length": 1689.6377563476562,
"epoch": 0.5288399222294232,
"grad_norm": 0.1927055929645657,
"kl": 0.00634002685546875,
"learning_rate": 5.826583544220678e-07,
"loss": -0.0038,
"reward": 0.49707163125276566,
"reward_std": 0.2744743190705776,
"rewards/improved_len_reward_dast": 0.49707163125276566,
"step": 204
},
{
"completion_length": 1282.392837524414,
"epoch": 0.5314322747893714,
"grad_norm": 0.17997245193338135,
"kl": 0.0058441162109375,
"learning_rate": 5.78581952212107e-07,
"loss": 0.0106,
"reward": 0.5941964462399483,
"reward_std": 0.18648012727499008,
"rewards/improved_len_reward_dast": 0.5941964462399483,
"step": 205
},
{
"completion_length": 1697.3724670410156,
"epoch": 0.5340246273493195,
"grad_norm": 0.17197705412944048,
"kl": 0.00629425048828125,
"learning_rate": 5.745031936702997e-07,
"loss": -0.0017,
"reward": 0.47735022753477097,
"reward_std": 0.23548034578561783,
"rewards/improved_len_reward_dast": 0.47735022753477097,
"step": 206
},
{
"completion_length": 1744.994873046875,
"epoch": 0.5366169799092677,
"grad_norm": 0.17998891738721526,
"kl": 0.00860595703125,
"learning_rate": 5.704224150545956e-07,
"loss": 0.015,
"reward": 0.41617942601442337,
"reward_std": 0.2671221233904362,
"rewards/improved_len_reward_dast": 0.41617942601442337,
"step": 207
},
{
"completion_length": 1260.086685180664,
"epoch": 0.5392093324692158,
"grad_norm": 0.20122932764023702,
"kl": 0.005645751953125,
"learning_rate": 5.663399527894816e-07,
"loss": 0.0215,
"reward": 0.6909545063972473,
"reward_std": 0.20719094015657902,
"rewards/improved_len_reward_dast": 0.6909545063972473,
"step": 208
},
{
"completion_length": 1616.2958984375,
"epoch": 0.5418016850291639,
"grad_norm": 0.1906417060993576,
"kl": 0.00637054443359375,
"learning_rate": 5.622561434382467e-07,
"loss": 0.0233,
"reward": 0.45805612206459045,
"reward_std": 0.25512534007430077,
"rewards/improved_len_reward_dast": 0.45805612206459045,
"step": 209
},
{
"completion_length": 2039.3724365234375,
"epoch": 0.5443940375891121,
"grad_norm": 0.19498631453970255,
"kl": 0.00815582275390625,
"learning_rate": 5.581713236752361e-07,
"loss": -0.0184,
"reward": 0.46333739161491394,
"reward_std": 0.27003057673573494,
"rewards/improved_len_reward_dast": 0.46333739161491394,
"step": 210
},
{
"completion_length": 1523.2754821777344,
"epoch": 0.5469863901490603,
"grad_norm": 0.20159676101246632,
"kl": 0.006622314453125,
"learning_rate": 5.540858302580934e-07,
"loss": 0.0411,
"reward": 0.5782179459929466,
"reward_std": 0.2389019876718521,
"rewards/improved_len_reward_dast": 0.5782179459929466,
"step": 211
},
{
"completion_length": 1725.341796875,
"epoch": 0.5495787427090084,
"grad_norm": 0.18132979624264559,
"kl": 0.0070343017578125,
"learning_rate": 5.5e-07,
"loss": -0.0379,
"reward": 0.2929552085697651,
"reward_std": 0.2936428487300873,
"rewards/improved_len_reward_dast": 0.2929552085697651,
"step": 212
},
{
"completion_length": 1522.3009948730469,
"epoch": 0.5521710952689566,
"grad_norm": 1.2243596579933262,
"kl": 0.01662445068359375,
"learning_rate": 5.459141697419066e-07,
"loss": 0.0108,
"reward": 0.5409562550485134,
"reward_std": 0.22488684952259064,
"rewards/improved_len_reward_dast": 0.5409562550485134,
"step": 213
},
{
"completion_length": 1319.0765075683594,
"epoch": 0.5547634478289047,
"grad_norm": 0.1722252949208986,
"kl": 0.00470733642578125,
"learning_rate": 5.418286763247641e-07,
"loss": 0.0069,
"reward": 0.6275194361805916,
"reward_std": 0.2474201563745737,
"rewards/improved_len_reward_dast": 0.6275194361805916,
"step": 214
},
{
"completion_length": 1788.4846801757812,
"epoch": 0.5573558003888529,
"grad_norm": 1.2611041670931815,
"kl": 0.013458251953125,
"learning_rate": 5.377438565617532e-07,
"loss": -0.0125,
"reward": 0.45079565048217773,
"reward_std": 0.280511736869812,
"rewards/improved_len_reward_dast": 0.45079565048217773,
"step": 215
},
{
"completion_length": 2106.933624267578,
"epoch": 0.5599481529488011,
"grad_norm": 0.19693707615605538,
"kl": 0.00965118408203125,
"learning_rate": 5.336600472105186e-07,
"loss": 0.0099,
"reward": 0.44794752448797226,
"reward_std": 0.2532258592545986,
"rewards/improved_len_reward_dast": 0.44794752448797226,
"step": 216
},
{
"completion_length": 1594.5458984375,
"epoch": 0.5625405055087492,
"grad_norm": 0.18133552112289136,
"kl": 0.0072021484375,
"learning_rate": 5.295775849454045e-07,
"loss": 0.0022,
"reward": 0.35640399530529976,
"reward_std": 0.22871940955519676,
"rewards/improved_len_reward_dast": 0.35640399530529976,
"step": 217
},
{
"completion_length": 2063.7244262695312,
"epoch": 0.5651328580686974,
"grad_norm": 0.1653596633833327,
"kl": 0.0097503662109375,
"learning_rate": 5.254968063297003e-07,
"loss": -0.0067,
"reward": 0.39714931696653366,
"reward_std": 0.2558470740914345,
"rewards/improved_len_reward_dast": 0.39714931696653366,
"step": 218
},
{
"completion_length": 1957.7550659179688,
"epoch": 0.5677252106286454,
"grad_norm": 0.17831099916308876,
"kl": 0.008544921875,
"learning_rate": 5.214180477878931e-07,
"loss": 0.0054,
"reward": 0.4488506466150284,
"reward_std": 0.28022897988557816,
"rewards/improved_len_reward_dast": 0.4488506466150284,
"step": 219
},
{
"completion_length": 2230.1275024414062,
"epoch": 0.5703175631885936,
"grad_norm": 0.23765533370039002,
"kl": 0.01194000244140625,
"learning_rate": 5.173416455779323e-07,
"loss": -0.0122,
"reward": 0.4094167836010456,
"reward_std": 0.2675712872296572,
"rewards/improved_len_reward_dast": 0.4094167836010456,
"step": 220
},
{
"completion_length": 1626.5203857421875,
"epoch": 0.5729099157485418,
"grad_norm": 0.2314899179996737,
"kl": 0.00766754150390625,
"learning_rate": 5.132679357635086e-07,
"loss": 0.0299,
"reward": 0.46570489555597305,
"reward_std": 0.21487142890691757,
"rewards/improved_len_reward_dast": 0.46570489555597305,
"step": 221
},
{
"completion_length": 1790.2397766113281,
"epoch": 0.5755022683084899,
"grad_norm": 0.1860804689790846,
"kl": 0.00777435302734375,
"learning_rate": 5.091972541863481e-07,
"loss": 0.0162,
"reward": 0.42583951354026794,
"reward_std": 0.28225597366690636,
"rewards/improved_len_reward_dast": 0.42583951354026794,
"step": 222
},
{
"completion_length": 1288.7959289550781,
"epoch": 0.5780946208684381,
"grad_norm": 0.20669865534482024,
"kl": 0.0060882568359375,
"learning_rate": 5.051299364385257e-07,
"loss": 0.0231,
"reward": 0.5595748201012611,
"reward_std": 0.2434106133878231,
"rewards/improved_len_reward_dast": 0.5595748201012611,
"step": 223
},
{
"completion_length": 2255.5509643554688,
"epoch": 0.5806869734283863,
"grad_norm": 0.18272410031873515,
"kl": 0.01136016845703125,
"learning_rate": 5.010663178347971e-07,
"loss": 0.0234,
"reward": 0.47463729977607727,
"reward_std": 0.2821599170565605,
"rewards/improved_len_reward_dast": 0.47463729977607727,
"step": 224
},
{
"completion_length": 1452.3367004394531,
"epoch": 0.5832793259883344,
"grad_norm": 0.191897675765259,
"kl": 0.007843017578125,
"learning_rate": 4.970067333849568e-07,
"loss": 0.0344,
"reward": 0.502272866666317,
"reward_std": 0.22649240121245384,
"rewards/improved_len_reward_dast": 0.502272866666317,
"step": 225
},
{
"completion_length": 1654.5408020019531,
"epoch": 0.5858716785482826,
"grad_norm": 3.553604338224093,
"kl": 0.09326171875,
"learning_rate": 4.929515177662182e-07,
"loss": 0.0222,
"reward": 0.47584168612957,
"reward_std": 0.2622257173061371,
"rewards/improved_len_reward_dast": 0.47584168612957,
"step": 226
},
{
"completion_length": 1871.4132385253906,
"epoch": 0.5884640311082308,
"grad_norm": 0.1695314613295998,
"kl": 0.0087432861328125,
"learning_rate": 4.889010052956233e-07,
"loss": 0.0052,
"reward": 0.5272805392742157,
"reward_std": 0.2442457675933838,
"rewards/improved_len_reward_dast": 0.5272805392742157,
"step": 227
},
{
"completion_length": 1353.8213958740234,
"epoch": 0.5910563836681789,
"grad_norm": 0.18264539162380725,
"kl": 0.0059661865234375,
"learning_rate": 4.848555299024798e-07,
"loss": 0.0225,
"reward": 0.6240374892950058,
"reward_std": 0.1938109789043665,
"rewards/improved_len_reward_dast": 0.6240374892950058,
"step": 228
},
{
"completion_length": 1727.3112182617188,
"epoch": 0.593648736228127,
"grad_norm": 0.1734617136793063,
"kl": 0.00876617431640625,
"learning_rate": 4.80815425100833e-07,
"loss": 0.0211,
"reward": 0.5374634936451912,
"reward_std": 0.22801294550299644,
"rewards/improved_len_reward_dast": 0.5374634936451912,
"step": 229
},
{
"completion_length": 1789.8826293945312,
"epoch": 0.5962410887880751,
"grad_norm": 0.18912805948764874,
"kl": 0.0084228515625,
"learning_rate": 4.7678102396196983e-07,
"loss": -0.0126,
"reward": 0.4900341257452965,
"reward_std": 0.22806890308856964,
"rewards/improved_len_reward_dast": 0.4900341257452965,
"step": 230
},
{
"completion_length": 1693.0509948730469,
"epoch": 0.5988334413480233,
"grad_norm": 0.17597707523302372,
"kl": 0.00720977783203125,
"learning_rate": 4.727526590869605e-07,
"loss": -0.012,
"reward": 0.47888991981744766,
"reward_std": 0.2167413793504238,
"rewards/improved_len_reward_dast": 0.47888991981744766,
"step": 231
},
{
"completion_length": 1440.198959350586,
"epoch": 0.6014257939079715,
"grad_norm": 0.3232116423618759,
"kl": 0.00661468505859375,
"learning_rate": 4.6873066257923735e-07,
"loss": -0.0066,
"reward": 0.4444720149040222,
"reward_std": 0.1700468622148037,
"rewards/improved_len_reward_dast": 0.4444720149040222,
"step": 232
},
{
"completion_length": 2216.800994873047,
"epoch": 0.6040181464679196,
"grad_norm": 0.22016095164256272,
"kl": 0.00894927978515625,
"learning_rate": 4.647153660172173e-07,
"loss": 0.0621,
"reward": 0.46872628480196,
"reward_std": 0.26834653317928314,
"rewards/improved_len_reward_dast": 0.46872628480196,
"step": 233
},
{
"completion_length": 1988.7550659179688,
"epoch": 0.6066104990278678,
"grad_norm": 0.20038227395204466,
"kl": 0.01000213623046875,
"learning_rate": 4.607071004269647e-07,
"loss": 0.0274,
"reward": 0.5112068131566048,
"reward_std": 0.2712997607886791,
"rewards/improved_len_reward_dast": 0.5112068131566048,
"step": 234
},
{
"completion_length": 1631.9132385253906,
"epoch": 0.609202851587816,
"grad_norm": 0.20175936218365448,
"kl": 0.00800323486328125,
"learning_rate": 4.567061962549025e-07,
"loss": -0.0159,
"reward": 0.5118747428059578,
"reward_std": 0.27685124427080154,
"rewards/improved_len_reward_dast": 0.5118747428059578,
"step": 235
},
{
"completion_length": 1950.7857055664062,
"epoch": 0.6117952041477641,
"grad_norm": 0.18134020080561208,
"kl": 0.0106353759765625,
"learning_rate": 4.527129833405687e-07,
"loss": 0.0038,
"reward": 0.5120773538947105,
"reward_std": 0.20266427472233772,
"rewards/improved_len_reward_dast": 0.5120773538947105,
"step": 236
},
{
"completion_length": 1583.6836242675781,
"epoch": 0.6143875567077123,
"grad_norm": 0.2073936085236417,
"kl": 0.007110595703125,
"learning_rate": 4.4872779088942425e-07,
"loss": 0.0249,
"reward": 0.5029871687293053,
"reward_std": 0.29201821237802505,
"rewards/improved_len_reward_dast": 0.5029871687293053,
"step": 237
},
{
"completion_length": 1926.5203857421875,
"epoch": 0.6169799092676604,
"grad_norm": 0.1774462909740951,
"kl": 0.0105438232421875,
"learning_rate": 4.447509474457135e-07,
"loss": 0.0247,
"reward": 0.5557538792490959,
"reward_std": 0.26750218868255615,
"rewards/improved_len_reward_dast": 0.5557538792490959,
"step": 238
},
{
"completion_length": 1847.1377563476562,
"epoch": 0.6195722618276086,
"grad_norm": 0.19817543454609501,
"kl": 0.00870513916015625,
"learning_rate": 4.4078278086537823e-07,
"loss": 0.0044,
"reward": 0.5904448255896568,
"reward_std": 0.2602488324046135,
"rewards/improved_len_reward_dast": 0.5904448255896568,
"step": 239
},
{
"completion_length": 1411.2958984375,
"epoch": 0.6221646143875567,
"grad_norm": 0.1766269163216527,
"kl": 0.0061492919921875,
"learning_rate": 4.3682361828902846e-07,
"loss": 0.0087,
"reward": 0.5749641954898834,
"reward_std": 0.2569588888436556,
"rewards/improved_len_reward_dast": 0.5749641954898834,
"step": 240
},
{
"completion_length": 1658.341812133789,
"epoch": 0.6247569669475048,
"grad_norm": 0.18786331552156418,
"kl": 0.00792694091796875,
"learning_rate": 4.328737861149726e-07,
"loss": 0.0186,
"reward": 0.33391743153333664,
"reward_std": 0.2631943728774786,
"rewards/improved_len_reward_dast": 0.33391743153333664,
"step": 241
},
{
"completion_length": 1752.903060913086,
"epoch": 0.627349319507453,
"grad_norm": 0.16247838117797347,
"kl": 0.00801849365234375,
"learning_rate": 4.289336099723098e-07,
"loss": -0.0073,
"reward": 0.5536581799387932,
"reward_std": 0.1957964338362217,
"rewards/improved_len_reward_dast": 0.5536581799387932,
"step": 242
},
{
"completion_length": 1809.596923828125,
"epoch": 0.6299416720674011,
"grad_norm": 0.2161692721743295,
"kl": 0.009197235107421875,
"learning_rate": 4.250034146940834e-07,
"loss": 0.0363,
"reward": 0.5511080101132393,
"reward_std": 0.18104272708296776,
"rewards/improved_len_reward_dast": 0.5511080101132393,
"step": 243
},
{
"completion_length": 1582.2550964355469,
"epoch": 0.6325340246273493,
"grad_norm": 0.19382955679222108,
"kl": 0.00809478759765625,
"learning_rate": 4.210835242905023e-07,
"loss": 0.0326,
"reward": 0.5464158207178116,
"reward_std": 0.25605272501707077,
"rewards/improved_len_reward_dast": 0.5464158207178116,
"step": 244
},
{
"completion_length": 1810.8673400878906,
"epoch": 0.6351263771872975,
"grad_norm": 0.19640394391195734,
"kl": 0.00948333740234375,
"learning_rate": 4.1717426192222784e-07,
"loss": 0.0292,
"reward": 0.5271303877234459,
"reward_std": 0.20934263616800308,
"rewards/improved_len_reward_dast": 0.5271303877234459,
"step": 245
},
{
"completion_length": 2071.0305786132812,
"epoch": 0.6377187297472456,
"grad_norm": 0.2180548556252771,
"kl": 0.00945281982421875,
"learning_rate": 4.1327594987373347e-07,
"loss": 0.0067,
"reward": 0.38764588721096516,
"reward_std": 0.23858479037880898,
"rewards/improved_len_reward_dast": 0.38764588721096516,
"step": 246
},
{
"completion_length": 2013.1223754882812,
"epoch": 0.6403110823071938,
"grad_norm": 0.1930442243491151,
"kl": 0.00946044921875,
"learning_rate": 4.0938890952673443e-07,
"loss": -0.0222,
"reward": 0.46171685308218,
"reward_std": 0.18625032529234886,
"rewards/improved_len_reward_dast": 0.46171685308218,
"step": 247
},
{
"completion_length": 1950.23974609375,
"epoch": 0.642903434867142,
"grad_norm": 0.17065165751997421,
"kl": 0.01047515869140625,
"learning_rate": 4.05513461333693e-07,
"loss": 0.008,
"reward": 0.48094385862350464,
"reward_std": 0.25418727472424507,
"rewards/improved_len_reward_dast": 0.48094385862350464,
"step": 248
},
{
"completion_length": 1824.0305480957031,
"epoch": 0.6454957874270901,
"grad_norm": 0.17991006553556818,
"kl": 0.0101165771484375,
"learning_rate": 4.016499247913994e-07,
"loss": 0.0192,
"reward": 0.517502948641777,
"reward_std": 0.22622046247124672,
"rewards/improved_len_reward_dast": 0.517502948641777,
"step": 249
},
{
"completion_length": 1910.5152587890625,
"epoch": 0.6480881399870383,
"grad_norm": 0.17870026031791122,
"kl": 0.0102386474609375,
"learning_rate": 3.977986184146328e-07,
"loss": 0.0037,
"reward": 0.6127093955874443,
"reward_std": 0.24417436867952347,
"rewards/improved_len_reward_dast": 0.6127093955874443,
"step": 250
},
{
"completion_length": 1963.0713500976562,
"epoch": 0.6506804925469863,
"grad_norm": 0.17652108057579807,
"kl": 0.00982666015625,
"learning_rate": 3.939598597099022e-07,
"loss": -0.0145,
"reward": 0.31848039478063583,
"reward_std": 0.29098184034228325,
"rewards/improved_len_reward_dast": 0.31848039478063583,
"step": 251
},
{
"completion_length": 1713.6223754882812,
"epoch": 0.6532728451069345,
"grad_norm": 0.17142346849906642,
"kl": 0.00846099853515625,
"learning_rate": 3.9013396514927076e-07,
"loss": 0.0119,
"reward": 0.47325168550014496,
"reward_std": 0.24261003732681274,
"rewards/improved_len_reward_dast": 0.47325168550014496,
"step": 252
},
{
"completion_length": 2062.188751220703,
"epoch": 0.6558651976668827,
"grad_norm": 0.16335646698761644,
"kl": 0.0091094970703125,
"learning_rate": 3.8632125014426566e-07,
"loss": 0.0001,
"reward": 0.40878694504499435,
"reward_std": 0.24995128065347672,
"rewards/improved_len_reward_dast": 0.40878694504499435,
"step": 253
},
{
"completion_length": 2270.3978881835938,
"epoch": 0.6584575502268308,
"grad_norm": 0.16560269347700182,
"kl": 0.0117950439453125,
"learning_rate": 3.8252202901987474e-07,
"loss": 0.0271,
"reward": 0.46808916330337524,
"reward_std": 0.21613015979528427,
"rewards/improved_len_reward_dast": 0.46808916330337524,
"step": 254
},
{
"completion_length": 1911.0305786132812,
"epoch": 0.661049902786779,
"grad_norm": 0.18694188171422124,
"kl": 0.0100860595703125,
"learning_rate": 3.7873661498863384e-07,
"loss": -0.0122,
"reward": 0.5386775732040405,
"reward_std": 0.267782025039196,
"rewards/improved_len_reward_dast": 0.5386775732040405,
"step": 255
},
{
"completion_length": 1850.2805786132812,
"epoch": 0.6636422553467272,
"grad_norm": 99.3387811319855,
"kl": 0.239776611328125,
"learning_rate": 3.7496532012480463e-07,
"loss": 0.0278,
"reward": 0.4910132810473442,
"reward_std": 0.2469508834183216,
"rewards/improved_len_reward_dast": 0.4910132810473442,
"step": 256
},
{
"completion_length": 1687.4540405273438,
"epoch": 0.6662346079066753,
"grad_norm": 0.2081083237423023,
"kl": 0.01031494140625,
"learning_rate": 3.7120845533864706e-07,
"loss": 0.0474,
"reward": 0.5329510420560837,
"reward_std": 0.18445927649736404,
"rewards/improved_len_reward_dast": 0.5329510420560837,
"step": 257
},
{
"completion_length": 2054.6275024414062,
"epoch": 0.6688269604666235,
"grad_norm": 0.2198626965043421,
"kl": 0.0117645263671875,
"learning_rate": 3.6746633035078723e-07,
"loss": -0.0103,
"reward": 0.38417188823223114,
"reward_std": 0.19621288403868675,
"rewards/improved_len_reward_dast": 0.38417188823223114,
"step": 258
},
{
"completion_length": 1646.8213806152344,
"epoch": 0.6714193130265717,
"grad_norm": 0.19620593996030333,
"kl": 0.00890350341796875,
"learning_rate": 3.63739253666684e-07,
"loss": 0.0092,
"reward": 0.5395868346095085,
"reward_std": 0.2524537071585655,
"rewards/improved_len_reward_dast": 0.5395868346095085,
"step": 259
},
{
"completion_length": 2210.8162536621094,
"epoch": 0.6740116655865198,
"grad_norm": 0.18772992549784875,
"kl": 0.01015472412109375,
"learning_rate": 3.6002753255119533e-07,
"loss": 0.0418,
"reward": 0.5520248711109161,
"reward_std": 0.25951434671878815,
"rewards/improved_len_reward_dast": 0.5520248711109161,
"step": 260
},
{
"completion_length": 1637.6224060058594,
"epoch": 0.6766040181464679,
"grad_norm": 0.20139190537407078,
"kl": 0.009979248046875,
"learning_rate": 3.5633147300324706e-07,
"loss": 0.0317,
"reward": 0.47854653000831604,
"reward_std": 0.21838786266744137,
"rewards/improved_len_reward_dast": 0.47854653000831604,
"step": 261
},
{
"completion_length": 1974.5560302734375,
"epoch": 0.679196370706416,
"grad_norm": 0.1703155196433375,
"kl": 0.0100555419921875,
"learning_rate": 3.526513797306051e-07,
"loss": -0.0087,
"reward": 0.5659954845905304,
"reward_std": 0.22994915768504143,
"rewards/improved_len_reward_dast": 0.5659954845905304,
"step": 262
},
{
"completion_length": 2071.5663146972656,
"epoch": 0.6817887232663642,
"grad_norm": 0.16298809195534278,
"kl": 0.013153076171875,
"learning_rate": 3.489875561247568e-07,
"loss": 0.0145,
"reward": 0.46151311695575714,
"reward_std": 0.2671518959105015,
"rewards/improved_len_reward_dast": 0.46151311695575714,
"step": 263
},
{
"completion_length": 1674.2091064453125,
"epoch": 0.6843810758263124,
"grad_norm": 0.1864849140911452,
"kl": 0.00850677490234375,
"learning_rate": 3.453403042358968e-07,
"loss": 0.0185,
"reward": 0.5161371529102325,
"reward_std": 0.24183812364935875,
"rewards/improved_len_reward_dast": 0.5161371529102325,
"step": 264
},
{
"completion_length": 1826.3213806152344,
"epoch": 0.6869734283862605,
"grad_norm": 0.18926873912683365,
"kl": 0.0092010498046875,
"learning_rate": 3.417099247480277e-07,
"loss": 0.0219,
"reward": 0.440113328397274,
"reward_std": 0.24182692915201187,
"rewards/improved_len_reward_dast": 0.440113328397274,
"step": 265
},
{
"completion_length": 2168.744842529297,
"epoch": 0.6895657809462087,
"grad_norm": 0.18825197599044874,
"kl": 0.0110931396484375,
"learning_rate": 3.3809671695416916e-07,
"loss": 0.0291,
"reward": 0.5052645355463028,
"reward_std": 0.3056667521595955,
"rewards/improved_len_reward_dast": 0.5052645355463028,
"step": 266
},
{
"completion_length": 1406.0509643554688,
"epoch": 0.6921581335061568,
"grad_norm": 0.1830588242176062,
"kl": 0.00689697265625,
"learning_rate": 3.345009787316859e-07,
"loss": 0.0028,
"reward": 0.5441867634654045,
"reward_std": 0.21564403921365738,
"rewards/improved_len_reward_dast": 0.5441867634654045,
"step": 267
},
{
"completion_length": 1561.0254669189453,
"epoch": 0.694750486066105,
"grad_norm": 0.19864755916409904,
"kl": 0.00759124755859375,
"learning_rate": 3.309230065177289e-07,
"loss": 0.0233,
"reward": 0.6223798245191574,
"reward_std": 0.22251487523317337,
"rewards/improved_len_reward_dast": 0.6223798245191574,
"step": 268
},
{
"completion_length": 1677.2652893066406,
"epoch": 0.6973428386260532,
"grad_norm": 0.18729962591317764,
"kl": 0.009307861328125,
"learning_rate": 3.273630952847971e-07,
"loss": 0.0169,
"reward": 0.5602849051356316,
"reward_std": 0.20688385143876076,
"rewards/improved_len_reward_dast": 0.5602849051356316,
"step": 269
},
{
"completion_length": 1901.9795837402344,
"epoch": 0.6999351911860013,
"grad_norm": 0.1757643708818276,
"kl": 0.00939178466796875,
"learning_rate": 3.2382153851641996e-07,
"loss": 0.0048,
"reward": 0.4372241795063019,
"reward_std": 0.1733334343880415,
"rewards/improved_len_reward_dast": 0.4372241795063019,
"step": 270
},
{
"completion_length": 1864.9489440917969,
"epoch": 0.7025275437459495,
"grad_norm": 0.20979118436624683,
"kl": 0.011383056640625,
"learning_rate": 3.202986281829616e-07,
"loss": 0.0047,
"reward": 0.49054908007383347,
"reward_std": 0.2722769007086754,
"rewards/improved_len_reward_dast": 0.49054908007383347,
"step": 271
},
{
"completion_length": 1864.551025390625,
"epoch": 0.7051198963058976,
"grad_norm": 0.18787099752483769,
"kl": 0.0099334716796875,
"learning_rate": 3.1679465471755106e-07,
"loss": 0.0112,
"reward": 0.4509451389312744,
"reward_std": 0.2106573022902012,
"rewards/improved_len_reward_dast": 0.4509451389312744,
"step": 272
},
{
"completion_length": 2081.4693298339844,
"epoch": 0.7077122488658457,
"grad_norm": 0.17067985532424773,
"kl": 0.013275146484375,
"learning_rate": 3.1330990699213824e-07,
"loss": 0.0178,
"reward": 0.52352125197649,
"reward_std": 0.17880443297326565,
"rewards/improved_len_reward_dast": 0.52352125197649,
"step": 273
},
{
"completion_length": 1941.5101928710938,
"epoch": 0.7103046014257939,
"grad_norm": 0.20384047669640917,
"kl": 0.009765625,
"learning_rate": 3.0984467229367885e-07,
"loss": -0.0165,
"reward": 0.47794508188962936,
"reward_std": 0.16116551123559475,
"rewards/improved_len_reward_dast": 0.47794508188962936,
"step": 274
},
{
"completion_length": 1740.596923828125,
"epoch": 0.712896953985742,
"grad_norm": 0.16442269270533758,
"kl": 0.0075531005859375,
"learning_rate": 3.063992363004503e-07,
"loss": 0.023,
"reward": 0.6044076532125473,
"reward_std": 0.24043289944529533,
"rewards/improved_len_reward_dast": 0.6044076532125473,
"step": 275
},
{
"completion_length": 1916.8571166992188,
"epoch": 0.7154893065456902,
"grad_norm": 0.20495543381215128,
"kl": 0.00904083251953125,
"learning_rate": 3.0297388305850004e-07,
"loss": 0.017,
"reward": 0.46368006244301796,
"reward_std": 0.2539185471832752,
"rewards/improved_len_reward_dast": 0.46368006244301796,
"step": 276
},
{
"completion_length": 1812.142822265625,
"epoch": 0.7180816591056384,
"grad_norm": 0.25645445924847915,
"kl": 0.010040283203125,
"learning_rate": 2.9956889495822877e-07,
"loss": 0.0104,
"reward": 0.5476516783237457,
"reward_std": 0.24734269082546234,
"rewards/improved_len_reward_dast": 0.5476516783237457,
"step": 277
},
{
"completion_length": 1896.540771484375,
"epoch": 0.7206740116655865,
"grad_norm": 0.1648074844235057,
"kl": 0.0090789794921875,
"learning_rate": 2.961845527111091e-07,
"loss": 0.0101,
"reward": 0.4374995678663254,
"reward_std": 0.22751843184232712,
"rewards/improved_len_reward_dast": 0.4374995678663254,
"step": 278
},
{
"completion_length": 1794.9744567871094,
"epoch": 0.7232663642255347,
"grad_norm": 0.1965304858919259,
"kl": 0.009490966796875,
"learning_rate": 2.9282113532654363e-07,
"loss": 0.0269,
"reward": 0.6033914387226105,
"reward_std": 0.21910444274544716,
"rewards/improved_len_reward_dast": 0.6033914387226105,
"step": 279
},
{
"completion_length": 1833.938720703125,
"epoch": 0.7258587167854829,
"grad_norm": 0.22688751850212643,
"kl": 0.0119781494140625,
"learning_rate": 2.894789200888634e-07,
"loss": 0.0314,
"reward": 0.6300860643386841,
"reward_std": 0.20502058789134026,
"rewards/improved_len_reward_dast": 0.6300860643386841,
"step": 280
},
{
"completion_length": 1426.5050659179688,
"epoch": 0.728451069345431,
"grad_norm": 0.19933588720750745,
"kl": 0.0085296630859375,
"learning_rate": 2.8615818253446766e-07,
"loss": 0.0176,
"reward": 0.6437288224697113,
"reward_std": 0.18815965950489044,
"rewards/improved_len_reward_dast": 0.6437288224697113,
"step": 281
},
{
"completion_length": 1340.8316345214844,
"epoch": 0.7310434219053791,
"grad_norm": 0.17972587406933935,
"kl": 0.00726318359375,
"learning_rate": 2.828591964291093e-07,
"loss": 0.0208,
"reward": 0.4648343026638031,
"reward_std": 0.22194743156433105,
"rewards/improved_len_reward_dast": 0.4648343026638031,
"step": 282
},
{
"completion_length": 1601.1275329589844,
"epoch": 0.7336357744653272,
"grad_norm": 0.19127024584768593,
"kl": 0.00716400146484375,
"learning_rate": 2.7958223374532363e-07,
"loss": 0.0235,
"reward": 0.4880499690771103,
"reward_std": 0.27051419019699097,
"rewards/improved_len_reward_dast": 0.4880499690771103,
"step": 283
},
{
"completion_length": 1403.9591674804688,
"epoch": 0.7362281270252754,
"grad_norm": 0.17860683081831877,
"kl": 0.007843017578125,
"learning_rate": 2.7632756464000835e-07,
"loss": 0.0191,
"reward": 0.6974282413721085,
"reward_std": 0.17937561869621277,
"rewards/improved_len_reward_dast": 0.6974282413721085,
"step": 284
},
{
"completion_length": 2051.2550354003906,
"epoch": 0.7388204795852236,
"grad_norm": 0.19636082014864284,
"kl": 0.0125885009765625,
"learning_rate": 2.730954574321503e-07,
"loss": 0.0296,
"reward": 0.3826203756034374,
"reward_std": 0.2091355100274086,
"rewards/improved_len_reward_dast": 0.3826203756034374,
"step": 285
},
{
"completion_length": 1637.9949035644531,
"epoch": 0.7414128321451717,
"grad_norm": 0.19076788482154552,
"kl": 0.00902557373046875,
"learning_rate": 2.698861785807055e-07,
"loss": 0.0357,
"reward": 0.5993083268404007,
"reward_std": 0.26309484988451004,
"rewards/improved_len_reward_dast": 0.5993083268404007,
"step": 286
},
{
"completion_length": 1831.4847106933594,
"epoch": 0.7440051847051199,
"grad_norm": 0.17423608191943502,
"kl": 0.00811004638671875,
"learning_rate": 2.6669999266263154e-07,
"loss": -0.009,
"reward": 0.4810323938727379,
"reward_std": 0.2594267800450325,
"rewards/improved_len_reward_dast": 0.4810323938727379,
"step": 287
},
{
"completion_length": 1873.801025390625,
"epoch": 0.7465975372650681,
"grad_norm": 0.16706780039675176,
"kl": 0.0091552734375,
"learning_rate": 2.635371623510758e-07,
"loss": 0.013,
"reward": 0.39794730208814144,
"reward_std": 0.2275175377726555,
"rewards/improved_len_reward_dast": 0.39794730208814144,
"step": 288
},
{
"completion_length": 1484.2346649169922,
"epoch": 0.7491898898250162,
"grad_norm": 0.21586526660516042,
"kl": 0.0080718994140625,
"learning_rate": 2.6039794839372066e-07,
"loss": -0.0156,
"reward": 0.49782148748636246,
"reward_std": 0.24559944868087769,
"rewards/improved_len_reward_dast": 0.49782148748636246,
"step": 289
},
{
"completion_length": 1958.8162536621094,
"epoch": 0.7517822423849644,
"grad_norm": 0.17918596004756887,
"kl": 0.0090789794921875,
"learning_rate": 2.5728260959128614e-07,
"loss": 0.0274,
"reward": 0.5516902059316635,
"reward_std": 0.21921641565859318,
"rewards/improved_len_reward_dast": 0.5516902059316635,
"step": 290
},
{
"completion_length": 2349.9642028808594,
"epoch": 0.7543745949449125,
"grad_norm": 0.17462481339673772,
"kl": 0.01324462890625,
"learning_rate": 2.541914027761951e-07,
"loss": 0.038,
"reward": 0.46060309559106827,
"reward_std": 0.24067510664463043,
"rewards/improved_len_reward_dast": 0.46060309559106827,
"step": 291
},
{
"completion_length": 1819.5612182617188,
"epoch": 0.7569669475048607,
"grad_norm": 0.20515893392963483,
"kl": 0.0117034912109375,
"learning_rate": 2.511245827913991e-07,
"loss": 0.0134,
"reward": 0.5075127482414246,
"reward_std": 0.2391039952635765,
"rewards/improved_len_reward_dast": 0.5075127482414246,
"step": 292
},
{
"completion_length": 1869.4029846191406,
"epoch": 0.7595593000648088,
"grad_norm": 0.18937190499433085,
"kl": 0.00841522216796875,
"learning_rate": 2.4808240246936866e-07,
"loss": 0.0268,
"reward": 0.42616455629467964,
"reward_std": 0.248293437063694,
"rewards/improved_len_reward_dast": 0.42616455629467964,
"step": 293
},
{
"completion_length": 1934.89794921875,
"epoch": 0.7621516526247569,
"grad_norm": 0.21697154774685395,
"kl": 0.0114898681640625,
"learning_rate": 2.450651126112504e-07,
"loss": 0.0579,
"reward": 0.558953121304512,
"reward_std": 0.23153522983193398,
"rewards/improved_len_reward_dast": 0.558953121304512,
"step": 294
},
{
"completion_length": 1610.1019439697266,
"epoch": 0.7647440051847051,
"grad_norm": 0.2226928196281353,
"kl": 0.00940704345703125,
"learning_rate": 2.4207296196618924e-07,
"loss": 0.0635,
"reward": 0.5272797495126724,
"reward_std": 0.18615226447582245,
"rewards/improved_len_reward_dast": 0.5272797495126724,
"step": 295
},
{
"completion_length": 1119.1581420898438,
"epoch": 0.7673363577446533,
"grad_norm": 0.2027474565515649,
"kl": 0.006317138671875,
"learning_rate": 2.3910619721082253e-07,
"loss": 0.0278,
"reward": 0.48764973133802414,
"reward_std": 0.2782805897295475,
"rewards/improved_len_reward_dast": 0.48764973133802414,
"step": 296
},
{
"completion_length": 1616.4081420898438,
"epoch": 0.7699287103046014,
"grad_norm": 0.17541198964340385,
"kl": 0.0094146728515625,
"learning_rate": 2.3616506292894282e-07,
"loss": -0.0067,
"reward": 0.5815595760941505,
"reward_std": 0.25435012578964233,
"rewards/improved_len_reward_dast": 0.5815595760941505,
"step": 297
},
{
"completion_length": 1853.7652587890625,
"epoch": 0.7725210628645496,
"grad_norm": 0.16574059006766365,
"kl": 0.0091705322265625,
"learning_rate": 2.332498015913344e-07,
"loss": -0.0098,
"reward": 0.5380261987447739,
"reward_std": 0.21175834722816944,
"rewards/improved_len_reward_dast": 0.5380261987447739,
"step": 298
},
{
"completion_length": 1436.3621978759766,
"epoch": 0.7751134154244977,
"grad_norm": 0.18476117099686806,
"kl": 0.00868988037109375,
"learning_rate": 2.303606535357843e-07,
"loss": 0.0273,
"reward": 0.6286723613739014,
"reward_std": 0.21005361154675484,
"rewards/improved_len_reward_dast": 0.6286723613739014,
"step": 299
},
{
"completion_length": 1812.0917663574219,
"epoch": 0.7777057679844459,
"grad_norm": 0.22027403000618603,
"kl": 0.00989532470703125,
"learning_rate": 2.2749785694726685e-07,
"loss": 0.0398,
"reward": 0.5523130521178246,
"reward_std": 0.2311643809080124,
"rewards/improved_len_reward_dast": 0.5523130521178246,
"step": 300
},
{
"completion_length": 1550.091796875,
"epoch": 0.7802981205443941,
"grad_norm": 0.16674263883125018,
"kl": 0.00970458984375,
"learning_rate": 2.2466164783830972e-07,
"loss": 0.0133,
"reward": 0.5227341949939728,
"reward_std": 0.2449796199798584,
"rewards/improved_len_reward_dast": 0.5227341949939728,
"step": 301
},
{
"completion_length": 1884.5560607910156,
"epoch": 0.7828904731043422,
"grad_norm": 0.18268480428784603,
"kl": 0.00943756103515625,
"learning_rate": 2.2185226002953483e-07,
"loss": -0.0221,
"reward": 0.5044264793395996,
"reward_std": 0.2946867607533932,
"rewards/improved_len_reward_dast": 0.5044264793395996,
"step": 302
},
{
"completion_length": 1944.2856750488281,
"epoch": 0.7854828256642904,
"grad_norm": 0.20936164270865662,
"kl": 0.01123046875,
"learning_rate": 2.1906992513038268e-07,
"loss": 0.0225,
"reward": 0.4654003605246544,
"reward_std": 0.30387868732213974,
"rewards/improved_len_reward_dast": 0.4654003605246544,
"step": 303
},
{
"completion_length": 1758.4132385253906,
"epoch": 0.7880751782242384,
"grad_norm": 0.17935190379674407,
"kl": 0.010589599609375,
"learning_rate": 2.1631487252001822e-07,
"loss": 0.0077,
"reward": 0.5262851193547249,
"reward_std": 0.2472703866660595,
"rewards/improved_len_reward_dast": 0.5262851193547249,
"step": 304
},
{
"completion_length": 2335.3468627929688,
"epoch": 0.7906675307841866,
"grad_norm": 0.15643277467226446,
"kl": 0.012451171875,
"learning_rate": 2.1358732932842032e-07,
"loss": 0.0207,
"reward": 0.3303733505308628,
"reward_std": 0.25162431970238686,
"rewards/improved_len_reward_dast": 0.3303733505308628,
"step": 305
},
{
"completion_length": 1861.2805480957031,
"epoch": 0.7932598833441348,
"grad_norm": 0.1970163252686287,
"kl": 0.00946044921875,
"learning_rate": 2.1088752041765734e-07,
"loss": 0.0398,
"reward": 0.5499422550201416,
"reward_std": 0.2154020182788372,
"rewards/improved_len_reward_dast": 0.5499422550201416,
"step": 306
},
{
"completion_length": 1671.3672790527344,
"epoch": 0.7958522359040829,
"grad_norm": 0.19782131445031087,
"kl": 0.00991058349609375,
"learning_rate": 2.0821566836334847e-07,
"loss": 0.0267,
"reward": 0.5274748802185059,
"reward_std": 0.22224271297454834,
"rewards/improved_len_reward_dast": 0.5274748802185059,
"step": 307
},
{
"completion_length": 1504.4183044433594,
"epoch": 0.7984445884640311,
"grad_norm": 0.19970611340783334,
"kl": 0.00811767578125,
"learning_rate": 2.0557199343631494e-07,
"loss": 0.01,
"reward": 0.4628491848707199,
"reward_std": 0.26031310856342316,
"rewards/improved_len_reward_dast": 0.4628491848707199,
"step": 308
},
{
"completion_length": 1926.744873046875,
"epoch": 0.8010369410239793,
"grad_norm": 0.18231345453995476,
"kl": 0.011356353759765625,
"learning_rate": 2.0295671358442033e-07,
"loss": 0.0294,
"reward": 0.4870244786143303,
"reward_std": 0.27465204894542694,
"rewards/improved_len_reward_dast": 0.4870244786143303,
"step": 309
},
{
"completion_length": 1507.6785430908203,
"epoch": 0.8036292935839274,
"grad_norm": 0.19445651546193465,
"kl": 0.008941650390625,
"learning_rate": 2.0037004441460263e-07,
"loss": 0.0194,
"reward": 0.5780586749315262,
"reward_std": 0.20703395083546638,
"rewards/improved_len_reward_dast": 0.5780586749315262,
"step": 310
},
{
"completion_length": 1608.9642639160156,
"epoch": 0.8062216461438756,
"grad_norm": 0.16750814215077678,
"kl": 0.0088043212890625,
"learning_rate": 1.9781219917509987e-07,
"loss": 0.0281,
"reward": 0.600249782204628,
"reward_std": 0.19979360327124596,
"rewards/improved_len_reward_dast": 0.600249782204628,
"step": 311
},
{
"completion_length": 1835.9234619140625,
"epoch": 0.8088139987038238,
"grad_norm": 0.2213868690216617,
"kl": 0.010528564453125,
"learning_rate": 1.9528338873786882e-07,
"loss": 0.0167,
"reward": 0.459771279245615,
"reward_std": 0.2465382032096386,
"rewards/improved_len_reward_dast": 0.459771279245615,
"step": 312
},
{
"completion_length": 1742.1785583496094,
"epoch": 0.8114063512637719,
"grad_norm": 0.1996790847530568,
"kl": 0.0092620849609375,
"learning_rate": 1.9278382158120116e-07,
"loss": 0.028,
"reward": 0.5697463825345039,
"reward_std": 0.27643223106861115,
"rewards/improved_len_reward_dast": 0.5697463825345039,
"step": 313
},
{
"completion_length": 1522.6886901855469,
"epoch": 0.81399870382372,
"grad_norm": 0.17972132296325355,
"kl": 0.006275177001953125,
"learning_rate": 1.9031370377253574e-07,
"loss": 0.0038,
"reward": 0.6382875889539719,
"reward_std": 0.2210763283073902,
"rewards/improved_len_reward_dast": 0.6382875889539719,
"step": 314
},
{
"completion_length": 1819.9336242675781,
"epoch": 0.8165910563836681,
"grad_norm": 0.23940587104074545,
"kl": 0.010528564453125,
"learning_rate": 1.8787323895147052e-07,
"loss": -0.0003,
"reward": 0.3470681682229042,
"reward_std": 0.29173849523067474,
"rewards/improved_len_reward_dast": 0.3470681682229042,
"step": 315
},
{
"completion_length": 1632.4795532226562,
"epoch": 0.8191834089436163,
"grad_norm": 0.1952690422984032,
"kl": 0.0094451904296875,
"learning_rate": 1.8546262831297438e-07,
"loss": -0.007,
"reward": 0.6234361678361893,
"reward_std": 0.2323027402162552,
"rewards/improved_len_reward_dast": 0.6234361678361893,
"step": 316
},
{
"completion_length": 1467.7703552246094,
"epoch": 0.8217757615035645,
"grad_norm": 0.18915172276823475,
"kl": 0.0072021484375,
"learning_rate": 1.8308207059079938e-07,
"loss": -0.0214,
"reward": 0.4193726107478142,
"reward_std": 0.28542226925492287,
"rewards/improved_len_reward_dast": 0.4193726107478142,
"step": 317
},
{
"completion_length": 1709.9183044433594,
"epoch": 0.8243681140635126,
"grad_norm": 0.18717550388451976,
"kl": 0.0111846923828125,
"learning_rate": 1.8073176204109837e-07,
"loss": 0.0136,
"reward": 0.6079925745725632,
"reward_std": 0.1753884293138981,
"rewards/improved_len_reward_dast": 0.6079925745725632,
"step": 318
},
{
"completion_length": 1549.0101623535156,
"epoch": 0.8269604666234608,
"grad_norm": 0.21375644169275265,
"kl": 0.00875091552734375,
"learning_rate": 1.7841189642624428e-07,
"loss": -0.0289,
"reward": 0.48142021149396896,
"reward_std": 0.28746388852596283,
"rewards/improved_len_reward_dast": 0.48142021149396896,
"step": 319
},
{
"completion_length": 1784.19384765625,
"epoch": 0.829552819183409,
"grad_norm": 0.17176452506605447,
"kl": 0.0102691650390625,
"learning_rate": 1.7612266499885642e-07,
"loss": 0.0089,
"reward": 0.6086387038230896,
"reward_std": 0.22643940150737762,
"rewards/improved_len_reward_dast": 0.6086387038230896,
"step": 320
},
{
"completion_length": 1210.2295684814453,
"epoch": 0.8321451717433571,
"grad_norm": 0.2124277117191842,
"kl": 0.00737762451171875,
"learning_rate": 1.7386425648603354e-07,
"loss": 0.0397,
"reward": 0.6170787662267685,
"reward_std": 0.2108596581965685,
"rewards/improved_len_reward_dast": 0.6170787662267685,
"step": 321
},
{
"completion_length": 1488.0816040039062,
"epoch": 0.8347375243033053,
"grad_norm": 0.2120577617498992,
"kl": 0.00836181640625,
"learning_rate": 1.716368570737946e-07,
"loss": 0.0409,
"reward": 0.601994976401329,
"reward_std": 0.2291371487081051,
"rewards/improved_len_reward_dast": 0.601994976401329,
"step": 322
},
{
"completion_length": 1990.8111419677734,
"epoch": 0.8373298768632534,
"grad_norm": 0.15844383624247524,
"kl": 0.010498046875,
"learning_rate": 1.6944065039173004e-07,
"loss": -0.0043,
"reward": 0.3935827948153019,
"reward_std": 0.2862498462200165,
"rewards/improved_len_reward_dast": 0.3935827948153019,
"step": 323
},
{
"completion_length": 2001.5918273925781,
"epoch": 0.8399222294232016,
"grad_norm": 0.16218776295200982,
"kl": 0.011138916015625,
"learning_rate": 1.672758174978622e-07,
"loss": 0.0059,
"reward": 0.4957594498991966,
"reward_std": 0.22672280296683311,
"rewards/improved_len_reward_dast": 0.4957594498991966,
"step": 324
},
{
"completion_length": 1417.3622131347656,
"epoch": 0.8425145819831497,
"grad_norm": 0.2087617301180929,
"kl": 0.00780487060546875,
"learning_rate": 1.6514253686371917e-07,
"loss": 0.0289,
"reward": 0.5871463492512703,
"reward_std": 0.21862176433205605,
"rewards/improved_len_reward_dast": 0.5871463492512703,
"step": 325
},
{
"completion_length": 1697.7295837402344,
"epoch": 0.8451069345430978,
"grad_norm": 0.16375311514593216,
"kl": 0.00887298583984375,
"learning_rate": 1.630409843596216e-07,
"loss": 0.0106,
"reward": 0.5537277311086655,
"reward_std": 0.22276470810174942,
"rewards/improved_len_reward_dast": 0.5537277311086655,
"step": 326
},
{
"completion_length": 1679.7193298339844,
"epoch": 0.847699287103046,
"grad_norm": 0.17301892622927403,
"kl": 0.0079193115234375,
"learning_rate": 1.609713332401831e-07,
"loss": 0.0271,
"reward": 0.5200591683387756,
"reward_std": 0.24121804535388947,
"rewards/improved_len_reward_dast": 0.5200591683387756,
"step": 327
},
{
"completion_length": 1716.8316345214844,
"epoch": 0.8502916396629941,
"grad_norm": 0.21625142796960517,
"kl": 0.007904052734375,
"learning_rate": 1.5893375413002765e-07,
"loss": -0.0081,
"reward": 0.3760797679424286,
"reward_std": 0.3030256852507591,
"rewards/improved_len_reward_dast": 0.3760797679424286,
"step": 328
},
{
"completion_length": 2087.734649658203,
"epoch": 0.8528839922229423,
"grad_norm": 0.16651549154328052,
"kl": 0.0118865966796875,
"learning_rate": 1.569284150097226e-07,
"loss": 0.0193,
"reward": 0.526580810546875,
"reward_std": 0.20104971155524254,
"rewards/improved_len_reward_dast": 0.526580810546875,
"step": 329
},
{
"completion_length": 1818.8520202636719,
"epoch": 0.8554763447828905,
"grad_norm": 0.18294456751416227,
"kl": 0.0110626220703125,
"learning_rate": 1.5495548120193003e-07,
"loss": -0.0005,
"reward": 0.6227145195007324,
"reward_std": 0.2254919856786728,
"rewards/improved_len_reward_dast": 0.6227145195007324,
"step": 330
},
{
"completion_length": 1455.596939086914,
"epoch": 0.8580686973428386,
"grad_norm": 0.18443385066386403,
"kl": 0.006561279296875,
"learning_rate": 1.5301511535777784e-07,
"loss": 0.0069,
"reward": 0.6691017746925354,
"reward_std": 0.2389560304582119,
"rewards/improved_len_reward_dast": 0.6691017746925354,
"step": 331
},
{
"completion_length": 1708.9540710449219,
"epoch": 0.8606610499027868,
"grad_norm": 0.22349331481770487,
"kl": 0.01061248779296875,
"learning_rate": 1.5110747744345006e-07,
"loss": 0.0166,
"reward": 0.5234609097242355,
"reward_std": 0.22545504197478294,
"rewards/improved_len_reward_dast": 0.5234609097242355,
"step": 332
},
{
"completion_length": 1791.2856750488281,
"epoch": 0.863253402462735,
"grad_norm": 0.20595620886423197,
"kl": 0.0110321044921875,
"learning_rate": 1.4923272472699986e-07,
"loss": 0.0132,
"reward": 0.4878820851445198,
"reward_std": 0.19468558579683304,
"rewards/improved_len_reward_dast": 0.4878820851445198,
"step": 333
},
{
"completion_length": 1111.2040557861328,
"epoch": 0.8658457550226831,
"grad_norm": 0.18776891622154324,
"kl": 0.00612640380859375,
"learning_rate": 1.4739101176538274e-07,
"loss": 0.0092,
"reward": 0.4363629147410393,
"reward_std": 0.23635073751211166,
"rewards/improved_len_reward_dast": 0.4363629147410393,
"step": 334
},
{
"completion_length": 1884.7958984375,
"epoch": 0.8684381075826313,
"grad_norm": 0.2336826524549116,
"kl": 0.0115203857421875,
"learning_rate": 1.4558249039171639e-07,
"loss": 0.0514,
"reward": 0.5176705569028854,
"reward_std": 0.24876829609274864,
"rewards/improved_len_reward_dast": 0.5176705569028854,
"step": 335
},
{
"completion_length": 2040.5509643554688,
"epoch": 0.8710304601425793,
"grad_norm": 0.218416613251664,
"kl": 0.0112457275390625,
"learning_rate": 1.4380730970276195e-07,
"loss": 0.0374,
"reward": 0.4303254596889019,
"reward_std": 0.22433782927691936,
"rewards/improved_len_reward_dast": 0.4303254596889019,
"step": 336
},
{
"completion_length": 2323.64794921875,
"epoch": 0.8736228127025275,
"grad_norm": 0.21230824911936896,
"kl": 0.015228271484375,
"learning_rate": 1.420656160466333e-07,
"loss": 0.0156,
"reward": 0.37112269178032875,
"reward_std": 0.22541575506329536,
"rewards/improved_len_reward_dast": 0.37112269178032875,
"step": 337
},
{
"completion_length": 1894.1683044433594,
"epoch": 0.8762151652624757,
"grad_norm": 0.229570039046522,
"kl": 0.0124053955078125,
"learning_rate": 1.4035755301073102e-07,
"loss": 0.0045,
"reward": 0.5252515897154808,
"reward_std": 0.2364770919084549,
"rewards/improved_len_reward_dast": 0.5252515897154808,
"step": 338
},
{
"completion_length": 1622.8418273925781,
"epoch": 0.8788075178224238,
"grad_norm": 0.1958204168185744,
"kl": 0.01035308837890625,
"learning_rate": 1.386832614099056e-07,
"loss": 0.0081,
"reward": 0.5822550505399704,
"reward_std": 0.21591638028621674,
"rewards/improved_len_reward_dast": 0.5822550505399704,
"step": 339
},
{
"completion_length": 1524.19384765625,
"epoch": 0.881399870382372,
"grad_norm": 0.18334226293404582,
"kl": 0.0096435546875,
"learning_rate": 1.3704287927484846e-07,
"loss": 0.0124,
"reward": 0.45736076682806015,
"reward_std": 0.26574842631816864,
"rewards/improved_len_reward_dast": 0.45736076682806015,
"step": 340
},
{
"completion_length": 1764.8775329589844,
"epoch": 0.8839922229423202,
"grad_norm": 0.1983658544116603,
"kl": 0.01029205322265625,
"learning_rate": 1.3543654184071186e-07,
"loss": -0.0056,
"reward": 0.5266754031181335,
"reward_std": 0.19769595563411713,
"rewards/improved_len_reward_dast": 0.5266754031181335,
"step": 341
},
{
"completion_length": 1573.7908172607422,
"epoch": 0.8865845755022683,
"grad_norm": 0.20470905009725737,
"kl": 0.00878143310546875,
"learning_rate": 1.3386438153596067e-07,
"loss": 0.0079,
"reward": 0.45632604509592056,
"reward_std": 0.27837061509490013,
"rewards/improved_len_reward_dast": 0.45632604509592056,
"step": 342
},
{
"completion_length": 1546.9642639160156,
"epoch": 0.8891769280622165,
"grad_norm": 0.195646608080127,
"kl": 0.00881195068359375,
"learning_rate": 1.323265279714543e-07,
"loss": -0.0159,
"reward": 0.47241977229714394,
"reward_std": 0.20328497141599655,
"rewards/improved_len_reward_dast": 0.47241977229714394,
"step": 343
},
{
"completion_length": 1638.3571166992188,
"epoch": 0.8917692806221647,
"grad_norm": 0.19710744576953143,
"kl": 0.0110015869140625,
"learning_rate": 1.3082310792976202e-07,
"loss": 0.0262,
"reward": 0.5500081032514572,
"reward_std": 0.2118955608457327,
"rewards/improved_len_reward_dast": 0.5500081032514572,
"step": 344
},
{
"completion_length": 1797.3570861816406,
"epoch": 0.8943616331821128,
"grad_norm": 0.23174034375477884,
"kl": 0.01033782958984375,
"learning_rate": 1.293542453547102e-07,
"loss": 0.0376,
"reward": 0.5329776927828789,
"reward_std": 0.26193203777074814,
"rewards/improved_len_reward_dast": 0.5329776927828789,
"step": 345
},
{
"completion_length": 1538.1580810546875,
"epoch": 0.8969539857420609,
"grad_norm": 0.1904135528888599,
"kl": 0.00870513916015625,
"learning_rate": 1.279200613411642e-07,
"loss": 0.0264,
"reward": 0.5394655913114548,
"reward_std": 0.24537776410579681,
"rewards/improved_len_reward_dast": 0.5394655913114548,
"step": 346
},
{
"completion_length": 2166.448944091797,
"epoch": 0.899546338302009,
"grad_norm": 0.1557827805565501,
"kl": 0.0125579833984375,
"learning_rate": 1.2652067412504605e-07,
"loss": 0.0081,
"reward": 0.4106425456702709,
"reward_std": 0.30277248471975327,
"rewards/improved_len_reward_dast": 0.4106425456702709,
"step": 347
},
{
"completion_length": 1541.7806091308594,
"epoch": 0.9021386908619572,
"grad_norm": 0.19932118981267394,
"kl": 0.0081939697265625,
"learning_rate": 1.251561990735859e-07,
"loss": 0.0277,
"reward": 0.5063697546720505,
"reward_std": 0.24998274445533752,
"rewards/improved_len_reward_dast": 0.5063697546720505,
"step": 348
},
{
"completion_length": 2111.5306091308594,
"epoch": 0.9047310434219054,
"grad_norm": 0.2417651421099663,
"kl": 0.0139312744140625,
"learning_rate": 1.238267486758117e-07,
"loss": -0.0203,
"reward": 0.27164783608168364,
"reward_std": 0.22704457119107246,
"rewards/improved_len_reward_dast": 0.27164783608168364,
"step": 349
},
{
"completion_length": 1849.9336242675781,
"epoch": 0.9073233959818535,
"grad_norm": 0.21790965647021238,
"kl": 0.01129150390625,
"learning_rate": 1.2253243253327504e-07,
"loss": 0.0126,
"reward": 0.4877898320555687,
"reward_std": 0.2690836489200592,
"rewards/improved_len_reward_dast": 0.4877898320555687,
"step": 350
},
{
"completion_length": 1613.3826293945312,
"epoch": 0.9099157485418017,
"grad_norm": 0.20972635962388697,
"kl": 0.009246826171875,
"learning_rate": 1.212733573510154e-07,
"loss": 0.026,
"reward": 0.5494348630309105,
"reward_std": 0.2651122659444809,
"rewards/improved_len_reward_dast": 0.5494348630309105,
"step": 351
},
{
"completion_length": 1592.3264770507812,
"epoch": 0.9125081011017498,
"grad_norm": 0.21422445239057133,
"kl": 0.00946044921875,
"learning_rate": 1.20049626928764e-07,
"loss": 0.0288,
"reward": 0.5722446367144585,
"reward_std": 0.21782485768198967,
"rewards/improved_len_reward_dast": 0.5722446367144585,
"step": 352
},
{
"completion_length": 1722.6377258300781,
"epoch": 0.915100453661698,
"grad_norm": 0.19724359277224385,
"kl": 0.0107574462890625,
"learning_rate": 1.1886134215238539e-07,
"loss": 0.0022,
"reward": 0.6072470247745514,
"reward_std": 0.18419499322772026,
"rewards/improved_len_reward_dast": 0.6072470247745514,
"step": 353
},
{
"completion_length": 1321.903060913086,
"epoch": 0.9176928062216462,
"grad_norm": 0.18827903780162886,
"kl": 0.00800323486328125,
"learning_rate": 1.1770860098556122e-07,
"loss": 0.0036,
"reward": 0.6655057221651077,
"reward_std": 0.23036686331033707,
"rewards/improved_len_reward_dast": 0.6655057221651077,
"step": 354
},
{
"completion_length": 1697.489730834961,
"epoch": 0.9202851587815943,
"grad_norm": 0.18446628505764348,
"kl": 0.011688232421875,
"learning_rate": 1.1659149846171314e-07,
"loss": -0.0011,
"reward": 0.6077793091535568,
"reward_std": 0.24356402084231377,
"rewards/improved_len_reward_dast": 0.6077793091535568,
"step": 355
},
{
"completion_length": 1707.193862915039,
"epoch": 0.9228775113415425,
"grad_norm": 0.21847824906310198,
"kl": 0.0106964111328125,
"learning_rate": 1.1551012667616889e-07,
"loss": 0.0092,
"reward": 0.5661942809820175,
"reward_std": 0.20257538184523582,
"rewards/improved_len_reward_dast": 0.5661942809820175,
"step": 356
},
{
"completion_length": 2016.994873046875,
"epoch": 0.9254698639014906,
"grad_norm": 0.18270155740337932,
"kl": 0.0126800537109375,
"learning_rate": 1.1446457477856933e-07,
"loss": 0.0274,
"reward": 0.4170667566359043,
"reward_std": 0.2266511246562004,
"rewards/improved_len_reward_dast": 0.4170667566359043,
"step": 357
},
{
"completion_length": 1594.9795532226562,
"epoch": 0.9280622164614387,
"grad_norm": 0.22722827158120512,
"kl": 0.0077362060546875,
"learning_rate": 1.1345492896551908e-07,
"loss": 0.0393,
"reward": 0.553664393723011,
"reward_std": 0.31060051172971725,
"rewards/improved_len_reward_dast": 0.553664393723011,
"step": 358
},
{
"completion_length": 1834.0458679199219,
"epoch": 0.9306545690213869,
"grad_norm": 0.17537356402412543,
"kl": 0.00970458984375,
"learning_rate": 1.1248127247348025e-07,
"loss": 0.0211,
"reward": 0.5899290814995766,
"reward_std": 0.23436808586120605,
"rewards/improved_len_reward_dast": 0.5899290814995766,
"step": 359
},
{
"completion_length": 1781.6019744873047,
"epoch": 0.933246921581335,
"grad_norm": 0.18436444068948415,
"kl": 0.01174163818359375,
"learning_rate": 1.1154368557191032e-07,
"loss": 0.012,
"reward": 0.3762673009186983,
"reward_std": 0.2291586957871914,
"rewards/improved_len_reward_dast": 0.3762673009186983,
"step": 360
},
{
"completion_length": 1417.540771484375,
"epoch": 0.9358392741412832,
"grad_norm": 0.19623835654730662,
"kl": 0.0077362060546875,
"learning_rate": 1.1064224555664489e-07,
"loss": 0.0005,
"reward": 0.4894239827990532,
"reward_std": 0.2442505694925785,
"rewards/improved_len_reward_dast": 0.4894239827990532,
"step": 361
},
{
"completion_length": 1666.7703552246094,
"epoch": 0.9384316267012314,
"grad_norm": 0.17894551807778217,
"kl": 0.00939178466796875,
"learning_rate": 1.0977702674352485e-07,
"loss": 0.0265,
"reward": 0.607224777340889,
"reward_std": 0.18787994422018528,
"rewards/improved_len_reward_dast": 0.607224777340889,
"step": 362
},
{
"completion_length": 1492.4081420898438,
"epoch": 0.9410239792611795,
"grad_norm": 0.2507376552650098,
"kl": 0.00975799560546875,
"learning_rate": 1.0894810046227007e-07,
"loss": 0.0023,
"reward": 0.5297152251005173,
"reward_std": 0.23747341334819794,
"rewards/improved_len_reward_dast": 0.5297152251005173,
"step": 363
},
{
"completion_length": 1323.10205078125,
"epoch": 0.9436163318211277,
"grad_norm": 0.16318792733873802,
"kl": 0.006740570068359375,
"learning_rate": 1.0815553505059864e-07,
"loss": -0.0032,
"reward": 0.6097646132111549,
"reward_std": 0.1873321644961834,
"rewards/improved_len_reward_dast": 0.6097646132111549,
"step": 364
},
{
"completion_length": 1844.9795532226562,
"epoch": 0.9462086843810759,
"grad_norm": 0.2063060093669664,
"kl": 0.0108489990234375,
"learning_rate": 1.0739939584859327e-07,
"loss": 0.0645,
"reward": 0.5058535486459732,
"reward_std": 0.22045359015464783,
"rewards/improved_len_reward_dast": 0.5058535486459732,
"step": 365
},
{
"completion_length": 2252.352020263672,
"epoch": 0.948801036941024,
"grad_norm": 0.20816769237505958,
"kl": 0.0121002197265625,
"learning_rate": 1.066797451933144e-07,
"loss": 0.0024,
"reward": 0.340947512537241,
"reward_std": 0.3400820717215538,
"rewards/improved_len_reward_dast": 0.340947512537241,
"step": 366
},
{
"completion_length": 1632.3060607910156,
"epoch": 0.9513933895009722,
"grad_norm": 0.18459167765340137,
"kl": 0.009857177734375,
"learning_rate": 1.0599664241366108e-07,
"loss": 0.0108,
"reward": 0.5263752043247223,
"reward_std": 0.2795609086751938,
"rewards/improved_len_reward_dast": 0.5263752043247223,
"step": 367
},
{
"completion_length": 1941.2754516601562,
"epoch": 0.9539857420609202,
"grad_norm": 0.19606646732184554,
"kl": 0.0090179443359375,
"learning_rate": 1.0535014382547976e-07,
"loss": 0.0404,
"reward": 0.4571000598371029,
"reward_std": 0.33078011497855186,
"rewards/improved_len_reward_dast": 0.4571000598371029,
"step": 368
},
{
"completion_length": 1763.602035522461,
"epoch": 0.9565780946208684,
"grad_norm": 0.16415935179120342,
"kl": 0.010005950927734375,
"learning_rate": 1.0474030272692176e-07,
"loss": 0.0194,
"reward": 0.46546463668346405,
"reward_std": 0.2402571141719818,
"rewards/improved_len_reward_dast": 0.46546463668346405,
"step": 369
},
{
"completion_length": 1787.3367309570312,
"epoch": 0.9591704471808166,
"grad_norm": 0.18183476066972562,
"kl": 0.0098114013671875,
"learning_rate": 1.0416716939404906e-07,
"loss": 0.0247,
"reward": 0.5943343639373779,
"reward_std": 0.25974351167678833,
"rewards/improved_len_reward_dast": 0.5943343639373779,
"step": 370
},
{
"completion_length": 2110.2244567871094,
"epoch": 0.9617627997407647,
"grad_norm": 0.1753149156601552,
"kl": 0.0126953125,
"learning_rate": 1.0363079107668965e-07,
"loss": 0.028,
"reward": 0.42196690291166306,
"reward_std": 0.3020992539823055,
"rewards/improved_len_reward_dast": 0.42196690291166306,
"step": 371
},
{
"completion_length": 1868.1173095703125,
"epoch": 0.9643551523007129,
"grad_norm": 0.188995256537594,
"kl": 0.0104827880859375,
"learning_rate": 1.03131211994542e-07,
"loss": -0.0159,
"reward": 0.34801803156733513,
"reward_std": 0.30042145401239395,
"rewards/improved_len_reward_dast": 0.34801803156733513,
"step": 372
},
{
"completion_length": 1634.489761352539,
"epoch": 0.9669475048606611,
"grad_norm": 0.16806202813706952,
"kl": 0.0088043212890625,
"learning_rate": 1.0266847333352986e-07,
"loss": 0.0054,
"reward": 0.4557268023490906,
"reward_std": 0.24389904364943504,
"rewards/improved_len_reward_dast": 0.4557268023490906,
"step": 373
},
{
"completion_length": 1552.938720703125,
"epoch": 0.9695398574206092,
"grad_norm": 0.18394008200083298,
"kl": 0.0112762451171875,
"learning_rate": 1.022426132424064e-07,
"loss": 0.0133,
"reward": 0.47902625799179077,
"reward_std": 0.2295891009271145,
"rewards/improved_len_reward_dast": 0.47902625799179077,
"step": 374
},
{
"completion_length": 1685.0968933105469,
"epoch": 0.9721322099805574,
"grad_norm": 0.1713793991991847,
"kl": 0.00827789306640625,
"learning_rate": 1.0185366682960968e-07,
"loss": 0.0309,
"reward": 0.5218155384063721,
"reward_std": 0.23021429032087326,
"rewards/improved_len_reward_dast": 0.5218155384063721,
"step": 375
},
{
"completion_length": 1745.448974609375,
"epoch": 0.9747245625405055,
"grad_norm": 0.17042694708491835,
"kl": 0.01142120361328125,
"learning_rate": 1.015016661603677e-07,
"loss": 0.005,
"reward": 0.4762755334377289,
"reward_std": 0.17237477749586105,
"rewards/improved_len_reward_dast": 0.4762755334377289,
"step": 376
},
{
"completion_length": 1860.1581420898438,
"epoch": 0.9773169151004537,
"grad_norm": 0.196878907070725,
"kl": 0.01052093505859375,
"learning_rate": 1.011866402540555e-07,
"loss": 0.0255,
"reward": 0.4570060186088085,
"reward_std": 0.2744992598891258,
"rewards/improved_len_reward_dast": 0.4570060186088085,
"step": 377
},
{
"completion_length": 1672.1224365234375,
"epoch": 0.9799092676604018,
"grad_norm": 0.18295738763893107,
"kl": 0.00994110107421875,
"learning_rate": 1.0090861508180229e-07,
"loss": 0.0278,
"reward": 0.5498954951763153,
"reward_std": 0.205208458006382,
"rewards/improved_len_reward_dast": 0.5498954951763153,
"step": 378
},
{
"completion_length": 1366.7295379638672,
"epoch": 0.9825016202203499,
"grad_norm": 0.19995975950807063,
"kl": 0.00894927978515625,
"learning_rate": 1.006676135643506e-07,
"loss": 0.0182,
"reward": 0.6602616906166077,
"reward_std": 0.18208089470863342,
"rewards/improved_len_reward_dast": 0.6602616906166077,
"step": 379
},
{
"completion_length": 1685.9489440917969,
"epoch": 0.9850939727802981,
"grad_norm": 0.2671604672472017,
"kl": 0.0095367431640625,
"learning_rate": 1.004636555701666e-07,
"loss": 0.0317,
"reward": 0.5187881141901016,
"reward_std": 0.23186353966593742,
"rewards/improved_len_reward_dast": 0.5187881141901016,
"step": 380
},
{
"completion_length": 2225.1122436523438,
"epoch": 0.9876863253402463,
"grad_norm": 0.20851974707971935,
"kl": 0.0135955810546875,
"learning_rate": 1.0029675791380211e-07,
"loss": 0.0393,
"reward": 0.5472966581583023,
"reward_std": 0.22922645136713982,
"rewards/improved_len_reward_dast": 0.5472966581583023,
"step": 381
},
{
"completion_length": 1713.9336242675781,
"epoch": 0.9902786779001944,
"grad_norm": 0.19490938929994006,
"kl": 0.0126800537109375,
"learning_rate": 1.0016693435450846e-07,
"loss": 0.0017,
"reward": 0.392940990626812,
"reward_std": 0.2464723214507103,
"rewards/improved_len_reward_dast": 0.392940990626812,
"step": 382
},
{
"completion_length": 1858.6632385253906,
"epoch": 0.9928710304601426,
"grad_norm": 0.1901299895408558,
"kl": 0.00988006591796875,
"learning_rate": 1.00074195595102e-07,
"loss": 0.0158,
"reward": 0.4689394012093544,
"reward_std": 0.21242598444223404,
"rewards/improved_len_reward_dast": 0.4689394012093544,
"step": 383
},
{
"completion_length": 1422.4081573486328,
"epoch": 0.9954633830200907,
"grad_norm": 0.24495186709668992,
"kl": 0.009395599365234375,
"learning_rate": 1.0001854928108199e-07,
"loss": 0.0507,
"reward": 0.5447990372776985,
"reward_std": 0.21043004095554352,
"rewards/improved_len_reward_dast": 0.5447990372776985,
"step": 384
},
{
"completion_length": 1555.7091674804688,
"epoch": 0.9980557355800389,
"grad_norm": 0.21071152676099067,
"kl": 0.00982666015625,
"learning_rate": 1e-07,
"loss": 0.0156,
"reward": 0.4512558877468109,
"reward_std": 0.2603309191763401,
"rewards/improved_len_reward_dast": 0.4512558877468109,
"step": 385
},
{
"epoch": 0.9980557355800389,
"step": 385,
"total_flos": 0.0,
"train_loss": 0.01745116442015588,
"train_runtime": 68533.3834,
"train_samples_per_second": 0.158,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 385,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}