|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9980557355800389, |
|
"eval_steps": 500, |
|
"global_step": 385, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1848.5458984375, |
|
"epoch": 0.002592352559948153, |
|
"grad_norm": 0.15340697800867806, |
|
"kl": 0.0, |
|
"learning_rate": 2.564102564102564e-08, |
|
"loss": 0.0187, |
|
"reward": 0.5978657901287079, |
|
"reward_std": 0.2931807413697243, |
|
"rewards/improved_len_reward_dast": 0.5978657901287079, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2130.4540100097656, |
|
"epoch": 0.005184705119896306, |
|
"grad_norm": 0.19427816311387766, |
|
"kl": 0.0, |
|
"learning_rate": 5.128205128205128e-08, |
|
"loss": 0.0499, |
|
"reward": 0.3565452806651592, |
|
"reward_std": 0.20069235190749168, |
|
"rewards/improved_len_reward_dast": 0.3565452806651592, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2029.7754821777344, |
|
"epoch": 0.007777057679844459, |
|
"grad_norm": 0.15482062458259815, |
|
"kl": 0.00014066696166992188, |
|
"learning_rate": 7.692307692307692e-08, |
|
"loss": 0.0029, |
|
"reward": 0.36762307211756706, |
|
"reward_std": 0.26278356462717056, |
|
"rewards/improved_len_reward_dast": 0.36762307211756706, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2113.341796875, |
|
"epoch": 0.010369410239792612, |
|
"grad_norm": 0.14048722076875736, |
|
"kl": 0.00012814998626708984, |
|
"learning_rate": 1.0256410256410256e-07, |
|
"loss": -0.0221, |
|
"reward": 0.48202627897262573, |
|
"reward_std": 0.3225458636879921, |
|
"rewards/improved_len_reward_dast": 0.48202627897262573, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 1881.0356750488281, |
|
"epoch": 0.012961762799740765, |
|
"grad_norm": 0.15612866238173184, |
|
"kl": 0.00012302398681640625, |
|
"learning_rate": 1.2820512820512818e-07, |
|
"loss": -0.0027, |
|
"reward": 0.52305668592453, |
|
"reward_std": 0.21705364808440208, |
|
"rewards/improved_len_reward_dast": 0.52305668592453, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 2154.0663146972656, |
|
"epoch": 0.015554115359688918, |
|
"grad_norm": 0.15018629000790815, |
|
"kl": 0.00014281272888183594, |
|
"learning_rate": 1.5384615384615385e-07, |
|
"loss": -0.0153, |
|
"reward": 0.39995063841342926, |
|
"reward_std": 0.2720435969531536, |
|
"rewards/improved_len_reward_dast": 0.39995063841342926, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1875.1071472167969, |
|
"epoch": 0.01814646791963707, |
|
"grad_norm": 0.17127136002254692, |
|
"kl": 0.0001201629638671875, |
|
"learning_rate": 1.7948717948717948e-07, |
|
"loss": 0.0249, |
|
"reward": 0.3945396225899458, |
|
"reward_std": 0.25139790773391724, |
|
"rewards/improved_len_reward_dast": 0.3945396225899458, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 1811.3316040039062, |
|
"epoch": 0.020738820479585224, |
|
"grad_norm": 0.17118361086335854, |
|
"kl": 0.00010824203491210938, |
|
"learning_rate": 2.0512820512820512e-07, |
|
"loss": -0.0186, |
|
"reward": 0.4574318379163742, |
|
"reward_std": 0.2575865164399147, |
|
"rewards/improved_len_reward_dast": 0.4574318379163742, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 2194.7601928710938, |
|
"epoch": 0.023331173039533377, |
|
"grad_norm": 0.16326988555433258, |
|
"kl": 0.00013935565948486328, |
|
"learning_rate": 2.3076923076923078e-07, |
|
"loss": 0.0077, |
|
"reward": 0.34908392280340195, |
|
"reward_std": 0.2770259566605091, |
|
"rewards/improved_len_reward_dast": 0.34908392280340195, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1998.94384765625, |
|
"epoch": 0.02592352559948153, |
|
"grad_norm": 0.170741007261295, |
|
"kl": 0.00012493133544921875, |
|
"learning_rate": 2.5641025641025636e-07, |
|
"loss": 0.0192, |
|
"reward": 0.4179102033376694, |
|
"reward_std": 0.26221491396427155, |
|
"rewards/improved_len_reward_dast": 0.4179102033376694, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2219.8724365234375, |
|
"epoch": 0.028515878159429683, |
|
"grad_norm": 0.17546980641506144, |
|
"kl": 0.0001461505889892578, |
|
"learning_rate": 2.8205128205128203e-07, |
|
"loss": 0.0316, |
|
"reward": 0.29684413131326437, |
|
"reward_std": 0.31361983716487885, |
|
"rewards/improved_len_reward_dast": 0.29684413131326437, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 2014.03564453125, |
|
"epoch": 0.031108230719377836, |
|
"grad_norm": 0.1541338261942444, |
|
"kl": 0.00012749433517456055, |
|
"learning_rate": 3.076923076923077e-07, |
|
"loss": -0.0135, |
|
"reward": 0.32144954474642873, |
|
"reward_std": 0.30298536643385887, |
|
"rewards/improved_len_reward_dast": 0.32144954474642873, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 2028.5969543457031, |
|
"epoch": 0.033700583279325985, |
|
"grad_norm": 0.19174062151927665, |
|
"kl": 0.00013196468353271484, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.0178, |
|
"reward": 0.31833722069859505, |
|
"reward_std": 0.25612180307507515, |
|
"rewards/improved_len_reward_dast": 0.31833722069859505, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 2096.3060607910156, |
|
"epoch": 0.03629293583927414, |
|
"grad_norm": 0.14669329107868662, |
|
"kl": 0.00011658668518066406, |
|
"learning_rate": 3.5897435897435896e-07, |
|
"loss": 0.0049, |
|
"reward": 0.4347623288631439, |
|
"reward_std": 0.21591071039438248, |
|
"rewards/improved_len_reward_dast": 0.4347623288631439, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1500.5254821777344, |
|
"epoch": 0.03888528839922229, |
|
"grad_norm": 0.17776360380978226, |
|
"kl": 7.94529914855957e-05, |
|
"learning_rate": 3.8461538461538463e-07, |
|
"loss": 0.0382, |
|
"reward": 0.4248454347252846, |
|
"reward_std": 0.2069440335035324, |
|
"rewards/improved_len_reward_dast": 0.4248454347252846, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1272.0509948730469, |
|
"epoch": 0.04147764095917045, |
|
"grad_norm": 0.1784145917886179, |
|
"kl": 9.012222290039062e-05, |
|
"learning_rate": 4.1025641025641024e-07, |
|
"loss": 0.0089, |
|
"reward": 0.5397656932473183, |
|
"reward_std": 0.2598051242530346, |
|
"rewards/improved_len_reward_dast": 0.5397656932473183, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 2013.8724365234375, |
|
"epoch": 0.0440699935191186, |
|
"grad_norm": 0.1907346351579617, |
|
"kl": 0.0001175999641418457, |
|
"learning_rate": 4.358974358974359e-07, |
|
"loss": 0.0611, |
|
"reward": 0.3499421738088131, |
|
"reward_std": 0.33140237629413605, |
|
"rewards/improved_len_reward_dast": 0.3499421738088131, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1330.7244567871094, |
|
"epoch": 0.046662346079066754, |
|
"grad_norm": 0.2178582705000707, |
|
"kl": 7.390975952148438e-05, |
|
"learning_rate": 4.6153846153846156e-07, |
|
"loss": 0.0745, |
|
"reward": 0.380832314491272, |
|
"reward_std": 0.2622619494795799, |
|
"rewards/improved_len_reward_dast": 0.380832314491272, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 1699.9285888671875, |
|
"epoch": 0.0492546986390149, |
|
"grad_norm": 0.19343172432969366, |
|
"kl": 0.0001125335693359375, |
|
"learning_rate": 4.871794871794871e-07, |
|
"loss": 0.0827, |
|
"reward": 0.42683304101228714, |
|
"reward_std": 0.3005821108818054, |
|
"rewards/improved_len_reward_dast": 0.42683304101228714, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1751.9743957519531, |
|
"epoch": 0.05184705119896306, |
|
"grad_norm": 0.16168658590608048, |
|
"kl": 0.00012385845184326172, |
|
"learning_rate": 5.128205128205127e-07, |
|
"loss": -0.0121, |
|
"reward": 0.2530975602567196, |
|
"reward_std": 0.38571304827928543, |
|
"rewards/improved_len_reward_dast": 0.2530975602567196, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 2237.551025390625, |
|
"epoch": 0.05443940375891121, |
|
"grad_norm": 0.1689710162473967, |
|
"kl": 0.00014531612396240234, |
|
"learning_rate": 5.384615384615384e-07, |
|
"loss": 0.0204, |
|
"reward": 0.2948920242488384, |
|
"reward_std": 0.298846572637558, |
|
"rewards/improved_len_reward_dast": 0.2948920242488384, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 1934.6530151367188, |
|
"epoch": 0.057031756318859365, |
|
"grad_norm": 0.19901186477459418, |
|
"kl": 0.00010859966278076172, |
|
"learning_rate": 5.641025641025641e-07, |
|
"loss": 0.0711, |
|
"reward": 0.3974427357316017, |
|
"reward_std": 0.32176483422517776, |
|
"rewards/improved_len_reward_dast": 0.3974427357316017, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1771.6071166992188, |
|
"epoch": 0.059624108878807515, |
|
"grad_norm": 0.15341330162436806, |
|
"kl": 9.715557098388672e-05, |
|
"learning_rate": 5.897435897435898e-07, |
|
"loss": 0.0068, |
|
"reward": 0.5254772454500198, |
|
"reward_std": 0.21970795094966888, |
|
"rewards/improved_len_reward_dast": 0.5254772454500198, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1626.7601623535156, |
|
"epoch": 0.06221646143875567, |
|
"grad_norm": 0.17123059985482156, |
|
"kl": 0.00012636184692382812, |
|
"learning_rate": 6.153846153846154e-07, |
|
"loss": 0.0005, |
|
"reward": 0.37653250247240067, |
|
"reward_std": 0.3122313618659973, |
|
"rewards/improved_len_reward_dast": 0.37653250247240067, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 2203.3570861816406, |
|
"epoch": 0.06480881399870382, |
|
"grad_norm": 0.14530311020475567, |
|
"kl": 0.00015342235565185547, |
|
"learning_rate": 6.410256410256411e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3898318260908127, |
|
"reward_std": 0.2564953900873661, |
|
"rewards/improved_len_reward_dast": 0.3898318260908127, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1814.1887664794922, |
|
"epoch": 0.06740116655865197, |
|
"grad_norm": 0.16879506578340142, |
|
"kl": 0.00012159347534179688, |
|
"learning_rate": 6.666666666666666e-07, |
|
"loss": 0.0144, |
|
"reward": 0.41976478695869446, |
|
"reward_std": 0.28917887061834335, |
|
"rewards/improved_len_reward_dast": 0.41976478695869446, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 1796.790771484375, |
|
"epoch": 0.06999351911860013, |
|
"grad_norm": 0.1906922337651447, |
|
"kl": 0.0001188516616821289, |
|
"learning_rate": 6.923076923076922e-07, |
|
"loss": 0.0678, |
|
"reward": 0.4590848907828331, |
|
"reward_std": 0.2338687926530838, |
|
"rewards/improved_len_reward_dast": 0.4590848907828331, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1638.7346801757812, |
|
"epoch": 0.07258587167854828, |
|
"grad_norm": 0.19116566970185705, |
|
"kl": 0.00011014938354492188, |
|
"learning_rate": 7.179487179487179e-07, |
|
"loss": 0.0404, |
|
"reward": 0.41585223004221916, |
|
"reward_std": 0.2356470599770546, |
|
"rewards/improved_len_reward_dast": 0.41585223004221916, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 1974.6121826171875, |
|
"epoch": 0.07517822423849643, |
|
"grad_norm": 0.16589370632623032, |
|
"kl": 0.0001277923583984375, |
|
"learning_rate": 7.435897435897435e-07, |
|
"loss": -0.0039, |
|
"reward": 0.4259794130921364, |
|
"reward_std": 0.2807146720588207, |
|
"rewards/improved_len_reward_dast": 0.4259794130921364, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 1697.2346496582031, |
|
"epoch": 0.07777057679844458, |
|
"grad_norm": 0.15079867785526088, |
|
"kl": 9.143352508544922e-05, |
|
"learning_rate": 7.692307692307693e-07, |
|
"loss": -0.034, |
|
"reward": 0.29869329556822777, |
|
"reward_std": 0.2533705197274685, |
|
"rewards/improved_len_reward_dast": 0.29869329556822777, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2224.2754516601562, |
|
"epoch": 0.08036292935839275, |
|
"grad_norm": 0.15714058382235824, |
|
"kl": 0.00016355514526367188, |
|
"learning_rate": 7.948717948717948e-07, |
|
"loss": 0.0299, |
|
"reward": 0.5116054937243462, |
|
"reward_std": 0.26772793754935265, |
|
"rewards/improved_len_reward_dast": 0.5116054937243462, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 1872.5101928710938, |
|
"epoch": 0.0829552819183409, |
|
"grad_norm": 0.17779038577227782, |
|
"kl": 0.00011873245239257812, |
|
"learning_rate": 8.205128205128205e-07, |
|
"loss": 0.041, |
|
"reward": 0.3215858917683363, |
|
"reward_std": 0.27255750447511673, |
|
"rewards/improved_len_reward_dast": 0.3215858917683363, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1810.0560913085938, |
|
"epoch": 0.08554763447828904, |
|
"grad_norm": 0.18730102696321338, |
|
"kl": 0.00012505054473876953, |
|
"learning_rate": 8.461538461538461e-07, |
|
"loss": 0.0661, |
|
"reward": 0.483148779720068, |
|
"reward_std": 0.292447779327631, |
|
"rewards/improved_len_reward_dast": 0.483148779720068, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 2044.69384765625, |
|
"epoch": 0.0881399870382372, |
|
"grad_norm": 0.19212723817368654, |
|
"kl": 0.00014781951904296875, |
|
"learning_rate": 8.717948717948718e-07, |
|
"loss": 0.0301, |
|
"reward": 0.3396348973037675, |
|
"reward_std": 0.3017418198287487, |
|
"rewards/improved_len_reward_dast": 0.3396348973037675, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2474.2040405273438, |
|
"epoch": 0.09073233959818536, |
|
"grad_norm": 0.14515104577961038, |
|
"kl": 0.0001646280288696289, |
|
"learning_rate": 8.974358974358974e-07, |
|
"loss": 0.0022, |
|
"reward": 0.21619121730327606, |
|
"reward_std": 0.25060467794537544, |
|
"rewards/improved_len_reward_dast": 0.21619121730327606, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 2244.591827392578, |
|
"epoch": 0.09332469215813351, |
|
"grad_norm": 0.1551542493705291, |
|
"kl": 0.00016069412231445312, |
|
"learning_rate": 9.230769230769231e-07, |
|
"loss": 0.0396, |
|
"reward": 0.4676624909043312, |
|
"reward_std": 0.24170640110969543, |
|
"rewards/improved_len_reward_dast": 0.4676624909043312, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2032.0714111328125, |
|
"epoch": 0.09591704471808166, |
|
"grad_norm": 0.17617571435946433, |
|
"kl": 0.00017404556274414062, |
|
"learning_rate": 9.487179487179486e-07, |
|
"loss": 0.0694, |
|
"reward": 0.5147057101130486, |
|
"reward_std": 0.25003863498568535, |
|
"rewards/improved_len_reward_dast": 0.5147057101130486, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1702.9744262695312, |
|
"epoch": 0.0985093972780298, |
|
"grad_norm": 0.17747466302258563, |
|
"kl": 0.00012755393981933594, |
|
"learning_rate": 9.743589743589742e-07, |
|
"loss": 0.0511, |
|
"reward": 0.5134269595146179, |
|
"reward_std": 0.25962407886981964, |
|
"rewards/improved_len_reward_dast": 0.5134269595146179, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2443.3265075683594, |
|
"epoch": 0.10110174983797797, |
|
"grad_norm": 0.1545853869841857, |
|
"kl": 0.00020003318786621094, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0057, |
|
"reward": 0.1663860222324729, |
|
"reward_std": 0.2878994420170784, |
|
"rewards/improved_len_reward_dast": 0.1663860222324729, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2536.2295532226562, |
|
"epoch": 0.10369410239792612, |
|
"grad_norm": 0.14472339566870965, |
|
"kl": 0.00019025802612304688, |
|
"learning_rate": 9.99981450718918e-07, |
|
"loss": 0.0107, |
|
"reward": 0.337845042347908, |
|
"reward_std": 0.2574784606695175, |
|
"rewards/improved_len_reward_dast": 0.337845042347908, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 1802.9540405273438, |
|
"epoch": 0.10628645495787427, |
|
"grad_norm": 0.18393179565451373, |
|
"kl": 0.0001709461212158203, |
|
"learning_rate": 9.99925804404898e-07, |
|
"loss": 0.0556, |
|
"reward": 0.2605009600520134, |
|
"reward_std": 0.3141431324183941, |
|
"rewards/improved_len_reward_dast": 0.2605009600520134, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 1914.5356750488281, |
|
"epoch": 0.10887880751782242, |
|
"grad_norm": 0.14563287705556602, |
|
"kl": 0.00016033649444580078, |
|
"learning_rate": 9.998330656454915e-07, |
|
"loss": -0.0016, |
|
"reward": 0.506085067987442, |
|
"reward_std": 0.28512752801179886, |
|
"rewards/improved_len_reward_dast": 0.506085067987442, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1949.6172790527344, |
|
"epoch": 0.11147116007777058, |
|
"grad_norm": 0.1923555097878006, |
|
"kl": 0.0002301931381225586, |
|
"learning_rate": 9.99703242086198e-07, |
|
"loss": 0.0342, |
|
"reward": 0.3602943029254675, |
|
"reward_std": 0.25087232142686844, |
|
"rewards/improved_len_reward_dast": 0.3602943029254675, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 1847.9336242675781, |
|
"epoch": 0.11406351263771873, |
|
"grad_norm": 0.20544935031130743, |
|
"kl": 0.0001766681671142578, |
|
"learning_rate": 9.995363444298333e-07, |
|
"loss": 0.0184, |
|
"reward": 0.3888886272907257, |
|
"reward_std": 0.32428839057683945, |
|
"rewards/improved_len_reward_dast": 0.3888886272907257, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2265.096923828125, |
|
"epoch": 0.11665586519766688, |
|
"grad_norm": 0.14292203845841508, |
|
"kl": 0.00018930435180664062, |
|
"learning_rate": 9.993323864356492e-07, |
|
"loss": 0.0017, |
|
"reward": 0.22207820555195212, |
|
"reward_std": 0.25323856994509697, |
|
"rewards/improved_len_reward_dast": 0.22207820555195212, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2902.938720703125, |
|
"epoch": 0.11924821775761503, |
|
"grad_norm": 0.11292903909794, |
|
"kl": 0.00022602081298828125, |
|
"learning_rate": 9.990913849181977e-07, |
|
"loss": 0.009, |
|
"reward": 0.31735342741012573, |
|
"reward_std": 0.23353197798132896, |
|
"rewards/improved_len_reward_dast": 0.31735342741012573, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1847.5662689208984, |
|
"epoch": 0.1218405703175632, |
|
"grad_norm": 0.16221094286352689, |
|
"kl": 0.00019168853759765625, |
|
"learning_rate": 9.988133597459444e-07, |
|
"loss": 0.0308, |
|
"reward": 0.41010551154613495, |
|
"reward_std": 0.26410190016031265, |
|
"rewards/improved_len_reward_dast": 0.41010551154613495, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 1890.3928527832031, |
|
"epoch": 0.12443292287751134, |
|
"grad_norm": 0.18056689939637202, |
|
"kl": 0.00018024444580078125, |
|
"learning_rate": 9.984983338396323e-07, |
|
"loss": 0.0602, |
|
"reward": 0.41444508731365204, |
|
"reward_std": 0.19691497087478638, |
|
"rewards/improved_len_reward_dast": 0.41444508731365204, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 1593.5561218261719, |
|
"epoch": 0.1270252754374595, |
|
"grad_norm": 0.175446467485199, |
|
"kl": 0.0002167224884033203, |
|
"learning_rate": 9.981463331703903e-07, |
|
"loss": 0.0348, |
|
"reward": 0.5070051103830338, |
|
"reward_std": 0.21338175982236862, |
|
"rewards/improved_len_reward_dast": 0.5070051103830338, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 1878.2346496582031, |
|
"epoch": 0.12961762799740764, |
|
"grad_norm": 0.19397023295879293, |
|
"kl": 0.00023925304412841797, |
|
"learning_rate": 9.977573867575937e-07, |
|
"loss": -0.0055, |
|
"reward": 0.3375612124800682, |
|
"reward_std": 0.27106014266610146, |
|
"rewards/improved_len_reward_dast": 0.3375612124800682, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 2093.2193756103516, |
|
"epoch": 0.1322099805573558, |
|
"grad_norm": 0.17897729131572065, |
|
"kl": 0.0003161430358886719, |
|
"learning_rate": 9.9733152666647e-07, |
|
"loss": 0.0406, |
|
"reward": 0.391084011644125, |
|
"reward_std": 0.2787310928106308, |
|
"rewards/improved_len_reward_dast": 0.391084011644125, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 2547.8162841796875, |
|
"epoch": 0.13480233311730394, |
|
"grad_norm": 0.21312563621246147, |
|
"kl": 0.0002579689025878906, |
|
"learning_rate": 9.968687880054579e-07, |
|
"loss": 0.0204, |
|
"reward": 0.4913594201207161, |
|
"reward_std": 0.22158093005418777, |
|
"rewards/improved_len_reward_dast": 0.4913594201207161, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 1667.8571166992188, |
|
"epoch": 0.1373946856772521, |
|
"grad_norm": 0.20488856136613437, |
|
"kl": 0.0003466606140136719, |
|
"learning_rate": 9.963692089233104e-07, |
|
"loss": 0.0199, |
|
"reward": 0.33586448058485985, |
|
"reward_std": 0.3006225787103176, |
|
"rewards/improved_len_reward_dast": 0.33586448058485985, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 1683.8571166992188, |
|
"epoch": 0.13998703823720027, |
|
"grad_norm": 0.16794693148171316, |
|
"kl": 0.0003898143768310547, |
|
"learning_rate": 9.958328306059508e-07, |
|
"loss": 0.0545, |
|
"reward": 0.40445420145988464, |
|
"reward_std": 0.2224370762705803, |
|
"rewards/improved_len_reward_dast": 0.40445420145988464, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 1890.1173400878906, |
|
"epoch": 0.1425793907971484, |
|
"grad_norm": 0.23753881445921649, |
|
"kl": 0.0004630088806152344, |
|
"learning_rate": 9.952596972730782e-07, |
|
"loss": 0.0328, |
|
"reward": 0.2540663415566087, |
|
"reward_std": 0.19413560815155506, |
|
"rewards/improved_len_reward_dast": 0.2540663415566087, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 1981.3163146972656, |
|
"epoch": 0.14517174335709657, |
|
"grad_norm": 0.17990321241121518, |
|
"kl": 0.0003933906555175781, |
|
"learning_rate": 9.946498561745201e-07, |
|
"loss": 0.0525, |
|
"reward": 0.43116573989391327, |
|
"reward_std": 0.27771833911538124, |
|
"rewards/improved_len_reward_dast": 0.43116573989391327, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 2039.1785583496094, |
|
"epoch": 0.14776409591704473, |
|
"grad_norm": 0.1846330840092854, |
|
"kl": 0.0004582405090332031, |
|
"learning_rate": 9.94003357586339e-07, |
|
"loss": 0.039, |
|
"reward": 0.4446622207760811, |
|
"reward_std": 0.27240753918886185, |
|
"rewards/improved_len_reward_dast": 0.4446622207760811, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 2208.244842529297, |
|
"epoch": 0.15035644847699287, |
|
"grad_norm": 0.18237281644603112, |
|
"kl": 0.00033092498779296875, |
|
"learning_rate": 9.933202548066855e-07, |
|
"loss": 0.019, |
|
"reward": 0.40539775788784027, |
|
"reward_std": 0.2241816557943821, |
|
"rewards/improved_len_reward_dast": 0.40539775788784027, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2297.8978881835938, |
|
"epoch": 0.15294880103694103, |
|
"grad_norm": 0.14330971261584569, |
|
"kl": 0.00047016143798828125, |
|
"learning_rate": 9.926006041514068e-07, |
|
"loss": 0.0525, |
|
"reward": 0.42655882239341736, |
|
"reward_std": 0.31429572589695454, |
|
"rewards/improved_len_reward_dast": 0.42655882239341736, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2013.0254821777344, |
|
"epoch": 0.15554115359688916, |
|
"grad_norm": 0.17629803487682835, |
|
"kl": 0.0005960464477539062, |
|
"learning_rate": 9.918444649494012e-07, |
|
"loss": 0.0608, |
|
"reward": 0.48641955107450485, |
|
"reward_std": 0.20150578767061234, |
|
"rewards/improved_len_reward_dast": 0.48641955107450485, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2207.5408325195312, |
|
"epoch": 0.15813350615683733, |
|
"grad_norm": 0.15781143039008477, |
|
"kl": 0.0005693435668945312, |
|
"learning_rate": 9.9105189953773e-07, |
|
"loss": 0.0303, |
|
"reward": 0.4598369002342224, |
|
"reward_std": 0.27581632137298584, |
|
"rewards/improved_len_reward_dast": 0.4598369002342224, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 2301.9642639160156, |
|
"epoch": 0.1607258587167855, |
|
"grad_norm": 0.18692743277703477, |
|
"kl": 0.0006175041198730469, |
|
"learning_rate": 9.90222973256475e-07, |
|
"loss": 0.0776, |
|
"reward": 0.5364214852452278, |
|
"reward_std": 0.2540983334183693, |
|
"rewards/improved_len_reward_dast": 0.5364214852452278, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2645.9795532226562, |
|
"epoch": 0.16331821127673363, |
|
"grad_norm": 0.1418610635211088, |
|
"kl": 0.0006427764892578125, |
|
"learning_rate": 9.89357754443355e-07, |
|
"loss": -0.0025, |
|
"reward": 0.34263312071561813, |
|
"reward_std": 0.18951043859124184, |
|
"rewards/improved_len_reward_dast": 0.34263312071561813, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2645.8213500976562, |
|
"epoch": 0.1659105638366818, |
|
"grad_norm": 0.153618608876278, |
|
"kl": 0.0007190704345703125, |
|
"learning_rate": 9.884563144280897e-07, |
|
"loss": 0.0668, |
|
"reward": 0.38441576063632965, |
|
"reward_std": 0.23758746683597565, |
|
"rewards/improved_len_reward_dast": 0.38441576063632965, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 1942.7499389648438, |
|
"epoch": 0.16850291639662995, |
|
"grad_norm": 0.1677620586294242, |
|
"kl": 0.0009021759033203125, |
|
"learning_rate": 9.875187275265198e-07, |
|
"loss": 0.0045, |
|
"reward": 0.463797003030777, |
|
"reward_std": 0.22495094686746597, |
|
"rewards/improved_len_reward_dast": 0.463797003030777, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 2529.637664794922, |
|
"epoch": 0.1710952689565781, |
|
"grad_norm": 0.16610955001052316, |
|
"kl": 0.001155853271484375, |
|
"learning_rate": 9.865450710344807e-07, |
|
"loss": 0.0273, |
|
"reward": 0.28665875643491745, |
|
"reward_std": 0.2632176913321018, |
|
"rewards/improved_len_reward_dast": 0.28665875643491745, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2206.5152587890625, |
|
"epoch": 0.17368762151652625, |
|
"grad_norm": 0.1800502801294064, |
|
"kl": 0.0015268325805664062, |
|
"learning_rate": 9.855354252214307e-07, |
|
"loss": 0.0485, |
|
"reward": 0.3745214883238077, |
|
"reward_std": 0.2947724014520645, |
|
"rewards/improved_len_reward_dast": 0.3745214883238077, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 1694.0305480957031, |
|
"epoch": 0.1762799740764744, |
|
"grad_norm": 0.17423141164815986, |
|
"kl": 0.0011110305786132812, |
|
"learning_rate": 9.844898733238311e-07, |
|
"loss": 0.0307, |
|
"reward": 0.5885476693511009, |
|
"reward_std": 0.23455755040049553, |
|
"rewards/improved_len_reward_dast": 0.5885476693511009, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2007.4744873046875, |
|
"epoch": 0.17887232663642255, |
|
"grad_norm": 0.15896758825580698, |
|
"kl": 0.001338958740234375, |
|
"learning_rate": 9.83408501538287e-07, |
|
"loss": 0.0007, |
|
"reward": 0.31307457387447357, |
|
"reward_std": 0.28911132737994194, |
|
"rewards/improved_len_reward_dast": 0.31307457387447357, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 1773.3979187011719, |
|
"epoch": 0.18146467919637072, |
|
"grad_norm": 0.1796062383107639, |
|
"kl": 0.0017347335815429688, |
|
"learning_rate": 9.822913990144387e-07, |
|
"loss": -0.0399, |
|
"reward": 0.37001069262623787, |
|
"reward_std": 0.28749338537454605, |
|
"rewards/improved_len_reward_dast": 0.37001069262623787, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2173.596893310547, |
|
"epoch": 0.18405703175631885, |
|
"grad_norm": 0.16114217103117812, |
|
"kl": 0.0013170242309570312, |
|
"learning_rate": 9.811386578476146e-07, |
|
"loss": 0.0026, |
|
"reward": 0.43062953650951385, |
|
"reward_std": 0.2803415507078171, |
|
"rewards/improved_len_reward_dast": 0.43062953650951385, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 1955.3214111328125, |
|
"epoch": 0.18664938431626701, |
|
"grad_norm": 0.19621793042471328, |
|
"kl": 0.0011892318725585938, |
|
"learning_rate": 9.79950373071236e-07, |
|
"loss": 0.0391, |
|
"reward": 0.4167153127491474, |
|
"reward_std": 0.2199762761592865, |
|
"rewards/improved_len_reward_dast": 0.4167153127491474, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1949.1836242675781, |
|
"epoch": 0.18924173687621518, |
|
"grad_norm": 0.22914789827556842, |
|
"kl": 0.0016918182373046875, |
|
"learning_rate": 9.787266426489845e-07, |
|
"loss": 0.0899, |
|
"reward": 0.4319685846567154, |
|
"reward_std": 0.22768162935972214, |
|
"rewards/improved_len_reward_dast": 0.4319685846567154, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2204.3724365234375, |
|
"epoch": 0.1918340894361633, |
|
"grad_norm": 0.1817187740768752, |
|
"kl": 0.0019664764404296875, |
|
"learning_rate": 9.77467567466725e-07, |
|
"loss": -0.0129, |
|
"reward": 0.32414303719997406, |
|
"reward_std": 0.31503428146243095, |
|
"rewards/improved_len_reward_dast": 0.32414303719997406, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2346.0152282714844, |
|
"epoch": 0.19442644199611148, |
|
"grad_norm": 0.16770893885689953, |
|
"kl": 0.0021991729736328125, |
|
"learning_rate": 9.761732513241882e-07, |
|
"loss": 0.0258, |
|
"reward": 0.42492585629224777, |
|
"reward_std": 0.22089344635605812, |
|
"rewards/improved_len_reward_dast": 0.42492585629224777, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2032.0764465332031, |
|
"epoch": 0.1970187945560596, |
|
"grad_norm": 0.22921042830992452, |
|
"kl": 0.00289154052734375, |
|
"learning_rate": 9.748438009264142e-07, |
|
"loss": 0.0577, |
|
"reward": 0.5482478961348534, |
|
"reward_std": 0.23817146569490433, |
|
"rewards/improved_len_reward_dast": 0.5482478961348534, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 2265.1071166992188, |
|
"epoch": 0.19961114711600778, |
|
"grad_norm": 0.16354175184269162, |
|
"kl": 0.002227783203125, |
|
"learning_rate": 9.734793258749538e-07, |
|
"loss": 0.0147, |
|
"reward": 0.4374894965440035, |
|
"reward_std": 0.20623359642922878, |
|
"rewards/improved_len_reward_dast": 0.4374894965440035, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2333.576446533203, |
|
"epoch": 0.20220349967595594, |
|
"grad_norm": 0.16254218312259427, |
|
"kl": 0.002429962158203125, |
|
"learning_rate": 9.720799386588358e-07, |
|
"loss": -0.0152, |
|
"reward": 0.3343161977827549, |
|
"reward_std": 0.3130199611186981, |
|
"rewards/improved_len_reward_dast": 0.3343161977827549, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2167.6376953125, |
|
"epoch": 0.20479585223590407, |
|
"grad_norm": 0.16592757190255583, |
|
"kl": 0.003253936767578125, |
|
"learning_rate": 9.706457546452898e-07, |
|
"loss": -0.0054, |
|
"reward": 0.33338295854628086, |
|
"reward_std": 0.19700353033840656, |
|
"rewards/improved_len_reward_dast": 0.33338295854628086, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 2714.066192626953, |
|
"epoch": 0.20738820479585224, |
|
"grad_norm": 0.1643380510367972, |
|
"kl": 0.00283050537109375, |
|
"learning_rate": 9.691768920702379e-07, |
|
"loss": 0.0534, |
|
"reward": 0.27776505425572395, |
|
"reward_std": 0.2255413942039013, |
|
"rewards/improved_len_reward_dast": 0.27776505425572395, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 2177.2754516601562, |
|
"epoch": 0.2099805573558004, |
|
"grad_norm": 0.17174078043546434, |
|
"kl": 0.002231597900390625, |
|
"learning_rate": 9.676734720285456e-07, |
|
"loss": 0.0014, |
|
"reward": 0.3496297672390938, |
|
"reward_std": 0.2598918229341507, |
|
"rewards/improved_len_reward_dast": 0.3496297672390938, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 2181.4642639160156, |
|
"epoch": 0.21257290991574854, |
|
"grad_norm": 0.1541322467727904, |
|
"kl": 0.0022487640380859375, |
|
"learning_rate": 9.661356184640394e-07, |
|
"loss": 0.0224, |
|
"reward": 0.5033976063132286, |
|
"reward_std": 0.22605930641293526, |
|
"rewards/improved_len_reward_dast": 0.5033976063132286, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 2148.6376953125, |
|
"epoch": 0.2151652624756967, |
|
"grad_norm": 0.24078433672821112, |
|
"kl": 0.003170013427734375, |
|
"learning_rate": 9.64563458159288e-07, |
|
"loss": 0.075, |
|
"reward": 0.4664422944188118, |
|
"reward_std": 0.3008403405547142, |
|
"rewards/improved_len_reward_dast": 0.4664422944188118, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2446.83154296875, |
|
"epoch": 0.21775761503564484, |
|
"grad_norm": 0.23474448660395109, |
|
"kl": 0.002742767333984375, |
|
"learning_rate": 9.629571207251515e-07, |
|
"loss": 0.0716, |
|
"reward": 0.4225612059235573, |
|
"reward_std": 0.24569111317396164, |
|
"rewards/improved_len_reward_dast": 0.4225612059235573, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 2261.2294921875, |
|
"epoch": 0.220349967595593, |
|
"grad_norm": 0.18550775711299053, |
|
"kl": 0.00246429443359375, |
|
"learning_rate": 9.613167385900944e-07, |
|
"loss": -0.0098, |
|
"reward": 0.3237818730995059, |
|
"reward_std": 0.23490814864635468, |
|
"rewards/improved_len_reward_dast": 0.3237818730995059, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2063.73974609375, |
|
"epoch": 0.22294232015554116, |
|
"grad_norm": 0.1851730490973125, |
|
"kl": 0.002269744873046875, |
|
"learning_rate": 9.59642446989269e-07, |
|
"loss": -0.0064, |
|
"reward": 0.42184900864958763, |
|
"reward_std": 0.30175674334168434, |
|
"rewards/improved_len_reward_dast": 0.42184900864958763, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 2311.551025390625, |
|
"epoch": 0.2255346727154893, |
|
"grad_norm": 0.21075878937848122, |
|
"kl": 0.003543853759765625, |
|
"learning_rate": 9.579343839533668e-07, |
|
"loss": 0.0396, |
|
"reward": 0.420402854681015, |
|
"reward_std": 0.25258706137537956, |
|
"rewards/improved_len_reward_dast": 0.420402854681015, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2205.050994873047, |
|
"epoch": 0.22812702527543746, |
|
"grad_norm": 0.17968161341123623, |
|
"kl": 0.0030517578125, |
|
"learning_rate": 9.561926902972378e-07, |
|
"loss": 0.0301, |
|
"reward": 0.5036411881446838, |
|
"reward_std": 0.24565044790506363, |
|
"rewards/improved_len_reward_dast": 0.5036411881446838, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1845.0509338378906, |
|
"epoch": 0.23071937783538563, |
|
"grad_norm": 0.2117750181509081, |
|
"kl": 0.00276947021484375, |
|
"learning_rate": 9.544175096082838e-07, |
|
"loss": 0.0239, |
|
"reward": 0.5442821085453033, |
|
"reward_std": 0.2692565321922302, |
|
"rewards/improved_len_reward_dast": 0.5442821085453033, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 1978.9234313964844, |
|
"epoch": 0.23331173039533376, |
|
"grad_norm": 0.17014312125843986, |
|
"kl": 0.00310516357421875, |
|
"learning_rate": 9.526089882346172e-07, |
|
"loss": 0.0441, |
|
"reward": 0.45555737614631653, |
|
"reward_std": 0.2615541107952595, |
|
"rewards/improved_len_reward_dast": 0.45555737614631653, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 2018.1785583496094, |
|
"epoch": 0.23590408295528192, |
|
"grad_norm": 0.18461058936642788, |
|
"kl": 0.002346038818359375, |
|
"learning_rate": 9.507672752730001e-07, |
|
"loss": 0.036, |
|
"reward": 0.2985446793027222, |
|
"reward_std": 0.29198911786079407, |
|
"rewards/improved_len_reward_dast": 0.2985446793027222, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2159.846954345703, |
|
"epoch": 0.23849643551523006, |
|
"grad_norm": 0.18210923909541535, |
|
"kl": 0.0027294158935546875, |
|
"learning_rate": 9.4889252255655e-07, |
|
"loss": 0.0328, |
|
"reward": 0.41519875079393387, |
|
"reward_std": 0.2224278450012207, |
|
"rewards/improved_len_reward_dast": 0.41519875079393387, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 1984.6275024414062, |
|
"epoch": 0.24108878807517822, |
|
"grad_norm": 0.2141749380576668, |
|
"kl": 0.00240325927734375, |
|
"learning_rate": 9.469848846422223e-07, |
|
"loss": -0.0175, |
|
"reward": 0.26735120080411434, |
|
"reward_std": 0.2445412389934063, |
|
"rewards/improved_len_reward_dast": 0.26735120080411434, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 2195.3775329589844, |
|
"epoch": 0.2436811406351264, |
|
"grad_norm": 0.1762045925262696, |
|
"kl": 0.00284576416015625, |
|
"learning_rate": 9.450445187980699e-07, |
|
"loss": -0.0218, |
|
"reward": 0.33138592168688774, |
|
"reward_std": 0.3225051313638687, |
|
"rewards/improved_len_reward_dast": 0.33138592168688774, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 2310.7040100097656, |
|
"epoch": 0.24627349319507452, |
|
"grad_norm": 0.15765048447372448, |
|
"kl": 0.00351715087890625, |
|
"learning_rate": 9.430715849902774e-07, |
|
"loss": 0.0401, |
|
"reward": 0.43523500859737396, |
|
"reward_std": 0.2462395802140236, |
|
"rewards/improved_len_reward_dast": 0.43523500859737396, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 1937.6631774902344, |
|
"epoch": 0.24886584575502269, |
|
"grad_norm": 0.20879458110841193, |
|
"kl": 0.00296783447265625, |
|
"learning_rate": 9.410662458699723e-07, |
|
"loss": 0.0195, |
|
"reward": 0.46912187337875366, |
|
"reward_std": 0.2178829275071621, |
|
"rewards/improved_len_reward_dast": 0.46912187337875366, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1418.3111877441406, |
|
"epoch": 0.25145819831497085, |
|
"grad_norm": 0.19578242445789368, |
|
"kl": 0.002971649169921875, |
|
"learning_rate": 9.390286667598169e-07, |
|
"loss": 0.0235, |
|
"reward": 0.4482320174574852, |
|
"reward_std": 0.26671652123332024, |
|
"rewards/improved_len_reward_dast": 0.4482320174574852, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1816.6275024414062, |
|
"epoch": 0.254050550874919, |
|
"grad_norm": 0.22258502785393502, |
|
"kl": 0.002674102783203125, |
|
"learning_rate": 9.369590156403784e-07, |
|
"loss": 0.0319, |
|
"reward": 0.4338858500123024, |
|
"reward_std": 0.26093247532844543, |
|
"rewards/improved_len_reward_dast": 0.4338858500123024, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1874.5663146972656, |
|
"epoch": 0.2566429034348671, |
|
"grad_norm": 0.21715573777250466, |
|
"kl": 0.00341796875, |
|
"learning_rate": 9.348574631362808e-07, |
|
"loss": 0.0524, |
|
"reward": 0.49130718410015106, |
|
"reward_std": 0.2376222312450409, |
|
"rewards/improved_len_reward_dast": 0.49130718410015106, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 1574.3213806152344, |
|
"epoch": 0.2592352559948153, |
|
"grad_norm": 0.17861488086020622, |
|
"kl": 0.002559661865234375, |
|
"learning_rate": 9.327241825021379e-07, |
|
"loss": 0.0113, |
|
"reward": 0.6482488512992859, |
|
"reward_std": 0.18520537950098515, |
|
"rewards/improved_len_reward_dast": 0.6482488512992859, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2049.6734619140625, |
|
"epoch": 0.26182760855476345, |
|
"grad_norm": 0.18885164201528534, |
|
"kl": 0.003215789794921875, |
|
"learning_rate": 9.3055934960827e-07, |
|
"loss": 0.0223, |
|
"reward": 0.4320410490036011, |
|
"reward_std": 0.2535330019891262, |
|
"rewards/improved_len_reward_dast": 0.4320410490036011, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 1969.6479187011719, |
|
"epoch": 0.2644199611147116, |
|
"grad_norm": 0.18844502513210584, |
|
"kl": 0.003604888916015625, |
|
"learning_rate": 9.283631429262053e-07, |
|
"loss": 0.0465, |
|
"reward": 0.4825830012559891, |
|
"reward_std": 0.2706633023917675, |
|
"rewards/improved_len_reward_dast": 0.4825830012559891, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 1973.2142333984375, |
|
"epoch": 0.2670123136746598, |
|
"grad_norm": 0.17577302935565667, |
|
"kl": 0.00371551513671875, |
|
"learning_rate": 9.261357435139665e-07, |
|
"loss": -0.0154, |
|
"reward": 0.2741311024874449, |
|
"reward_std": 0.2971944361925125, |
|
"rewards/improved_len_reward_dast": 0.2741311024874449, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 1952.841796875, |
|
"epoch": 0.2696046662346079, |
|
"grad_norm": 0.20674937772252322, |
|
"kl": 0.0036773681640625, |
|
"learning_rate": 9.238773350011437e-07, |
|
"loss": 0.0516, |
|
"reward": 0.5049019902944565, |
|
"reward_std": 0.23485657200217247, |
|
"rewards/improved_len_reward_dast": 0.5049019902944565, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 1959.0458679199219, |
|
"epoch": 0.27219701879455604, |
|
"grad_norm": 0.17037566789121353, |
|
"kl": 0.003337860107421875, |
|
"learning_rate": 9.215881035737557e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5687128752470016, |
|
"reward_std": 0.26400860771536827, |
|
"rewards/improved_len_reward_dast": 0.5687128752470016, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2062.1377563476562, |
|
"epoch": 0.2747893713545042, |
|
"grad_norm": 0.18451392997367894, |
|
"kl": 0.003459930419921875, |
|
"learning_rate": 9.192682379589017e-07, |
|
"loss": 0.0431, |
|
"reward": 0.5168322995305061, |
|
"reward_std": 0.3160136565566063, |
|
"rewards/improved_len_reward_dast": 0.5168322995305061, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2263.1478576660156, |
|
"epoch": 0.27738172391445237, |
|
"grad_norm": 0.14735079239054202, |
|
"kl": 0.0033721923828125, |
|
"learning_rate": 9.169179294092006e-07, |
|
"loss": 0.0215, |
|
"reward": 0.4763122648000717, |
|
"reward_std": 0.2259252481162548, |
|
"rewards/improved_len_reward_dast": 0.4763122648000717, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 1663.2958679199219, |
|
"epoch": 0.27997407647440054, |
|
"grad_norm": 0.18037575865019131, |
|
"kl": 0.003185272216796875, |
|
"learning_rate": 9.145373716870257e-07, |
|
"loss": -0.0085, |
|
"reward": 0.4951760992407799, |
|
"reward_std": 0.2437908910214901, |
|
"rewards/improved_len_reward_dast": 0.4951760992407799, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2391.9795837402344, |
|
"epoch": 0.2825664290343487, |
|
"grad_norm": 0.19641500331835074, |
|
"kl": 0.00460052490234375, |
|
"learning_rate": 9.121267610485294e-07, |
|
"loss": 0.0161, |
|
"reward": 0.3704775348305702, |
|
"reward_std": 0.26634273678064346, |
|
"rewards/improved_len_reward_dast": 0.3704775348305702, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 1835.2346801757812, |
|
"epoch": 0.2851587815942968, |
|
"grad_norm": 0.16477117090344476, |
|
"kl": 0.00339508056640625, |
|
"learning_rate": 9.096862962274642e-07, |
|
"loss": -0.0073, |
|
"reward": 0.4292480945587158, |
|
"reward_std": 0.27501288428902626, |
|
"rewards/improved_len_reward_dast": 0.4292480945587158, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 2104.132568359375, |
|
"epoch": 0.28775113415424497, |
|
"grad_norm": 0.19390179121770934, |
|
"kl": 0.004150390625, |
|
"learning_rate": 9.072161784187988e-07, |
|
"loss": 0.0098, |
|
"reward": 0.3341917209327221, |
|
"reward_std": 0.30071910470724106, |
|
"rewards/improved_len_reward_dast": 0.3341917209327221, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 1651.9744262695312, |
|
"epoch": 0.29034348671419313, |
|
"grad_norm": 0.17112243727895823, |
|
"kl": 0.003452301025390625, |
|
"learning_rate": 9.047166112621312e-07, |
|
"loss": 0.0251, |
|
"reward": 0.5470812171697617, |
|
"reward_std": 0.21900975704193115, |
|
"rewards/improved_len_reward_dast": 0.5470812171697617, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 1929.290771484375, |
|
"epoch": 0.2929358392741413, |
|
"grad_norm": 0.15871059979904864, |
|
"kl": 0.004146575927734375, |
|
"learning_rate": 9.021878008249001e-07, |
|
"loss": -0.0091, |
|
"reward": 0.4469901919364929, |
|
"reward_std": 0.3005082905292511, |
|
"rewards/improved_len_reward_dast": 0.4469901919364929, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1670.8672790527344, |
|
"epoch": 0.29552819183408946, |
|
"grad_norm": 0.20014729950778795, |
|
"kl": 0.00397491455078125, |
|
"learning_rate": 8.996299555853973e-07, |
|
"loss": 0.0048, |
|
"reward": 0.4352063946425915, |
|
"reward_std": 0.30053506791591644, |
|
"rewards/improved_len_reward_dast": 0.4352063946425915, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 1748.6427917480469, |
|
"epoch": 0.29812054439403757, |
|
"grad_norm": 0.19064081554515172, |
|
"kl": 0.003910064697265625, |
|
"learning_rate": 8.970432864155798e-07, |
|
"loss": 0.0279, |
|
"reward": 0.4250146150588989, |
|
"reward_std": 0.28123610466718674, |
|
"rewards/improved_len_reward_dast": 0.4250146150588989, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 1927.2347106933594, |
|
"epoch": 0.30071289695398573, |
|
"grad_norm": 0.19497710947535807, |
|
"kl": 0.00446319580078125, |
|
"learning_rate": 8.944280065636851e-07, |
|
"loss": -0.0085, |
|
"reward": 0.48724526911973953, |
|
"reward_std": 0.25488007813692093, |
|
"rewards/improved_len_reward_dast": 0.48724526911973953, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 1818.4541015625, |
|
"epoch": 0.3033052495139339, |
|
"grad_norm": 0.34088335134181214, |
|
"kl": 0.004119873046875, |
|
"learning_rate": 8.917843316366515e-07, |
|
"loss": -0.0105, |
|
"reward": 0.4738898351788521, |
|
"reward_std": 0.23307713866233826, |
|
"rewards/improved_len_reward_dast": 0.4738898351788521, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 2024.6478881835938, |
|
"epoch": 0.30589760207388206, |
|
"grad_norm": 0.21372750051520226, |
|
"kl": 0.005035400390625, |
|
"learning_rate": 8.891124795823426e-07, |
|
"loss": -0.0181, |
|
"reward": 0.3491591773927212, |
|
"reward_std": 0.23684632405638695, |
|
"rewards/improved_len_reward_dast": 0.3491591773927212, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1981.9183349609375, |
|
"epoch": 0.3084899546338302, |
|
"grad_norm": 0.16944043573052223, |
|
"kl": 0.00438690185546875, |
|
"learning_rate": 8.864126706715796e-07, |
|
"loss": 0.0496, |
|
"reward": 0.5126907303929329, |
|
"reward_std": 0.186638955026865, |
|
"rewards/improved_len_reward_dast": 0.5126907303929329, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 2106.7754821777344, |
|
"epoch": 0.31108230719377833, |
|
"grad_norm": 0.17722812898313278, |
|
"kl": 0.0058135986328125, |
|
"learning_rate": 8.83685127479982e-07, |
|
"loss": 0.0527, |
|
"reward": 0.48729025572538376, |
|
"reward_std": 0.1900501400232315, |
|
"rewards/improved_len_reward_dast": 0.48729025572538376, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 1759.3673095703125, |
|
"epoch": 0.3136746597537265, |
|
"grad_norm": 0.18128869032474573, |
|
"kl": 0.0042724609375, |
|
"learning_rate": 8.809300748696173e-07, |
|
"loss": 0.0305, |
|
"reward": 0.45106371864676476, |
|
"reward_std": 0.2715425603091717, |
|
"rewards/improved_len_reward_dast": 0.45106371864676476, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2413.795867919922, |
|
"epoch": 0.31626701231367466, |
|
"grad_norm": 0.18347422721089623, |
|
"kl": 0.00518035888671875, |
|
"learning_rate": 8.781477399704652e-07, |
|
"loss": 0.0274, |
|
"reward": 0.3711659908294678, |
|
"reward_std": 0.2643970772624016, |
|
"rewards/improved_len_reward_dast": 0.3711659908294678, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2341.85205078125, |
|
"epoch": 0.3188593648736228, |
|
"grad_norm": 0.16392258093452686, |
|
"kl": 0.00547027587890625, |
|
"learning_rate": 8.753383521616902e-07, |
|
"loss": -0.0124, |
|
"reward": 0.4358869791030884, |
|
"reward_std": 0.25290554389357567, |
|
"rewards/improved_len_reward_dast": 0.4358869791030884, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1800.4285278320312, |
|
"epoch": 0.321451717433571, |
|
"grad_norm": 0.19958740910637524, |
|
"kl": 0.004329681396484375, |
|
"learning_rate": 8.72502143052733e-07, |
|
"loss": -0.032, |
|
"reward": 0.38679035007953644, |
|
"reward_std": 0.24552029743790627, |
|
"rewards/improved_len_reward_dast": 0.38679035007953644, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 1991.841796875, |
|
"epoch": 0.32404406999351915, |
|
"grad_norm": 0.1577890471913444, |
|
"kl": 0.004730224609375, |
|
"learning_rate": 8.696393464642158e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5458214432001114, |
|
"reward_std": 0.23444852605462074, |
|
"rewards/improved_len_reward_dast": 0.5458214432001114, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1776.3826293945312, |
|
"epoch": 0.32663642255346725, |
|
"grad_norm": 0.17200589761505877, |
|
"kl": 0.004123687744140625, |
|
"learning_rate": 8.667501984086655e-07, |
|
"loss": 0.0051, |
|
"reward": 0.5649043023586273, |
|
"reward_std": 0.22592130675911903, |
|
"rewards/improved_len_reward_dast": 0.5649043023586273, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1802.2703857421875, |
|
"epoch": 0.3292287751134154, |
|
"grad_norm": 0.17760415544496286, |
|
"kl": 0.00440216064453125, |
|
"learning_rate": 8.638349370710573e-07, |
|
"loss": 0.0208, |
|
"reward": 0.4263976775109768, |
|
"reward_std": 0.23108776286244392, |
|
"rewards/improved_len_reward_dast": 0.4263976775109768, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 1714.5101623535156, |
|
"epoch": 0.3318211276733636, |
|
"grad_norm": 0.19240596915490546, |
|
"kl": 0.004238128662109375, |
|
"learning_rate": 8.608938027891775e-07, |
|
"loss": -0.0168, |
|
"reward": 0.5219497531652451, |
|
"reward_std": 0.28100451827049255, |
|
"rewards/improved_len_reward_dast": 0.5219497531652451, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 1703.540771484375, |
|
"epoch": 0.33441348023331174, |
|
"grad_norm": 0.1869791882546724, |
|
"kl": 0.004779815673828125, |
|
"learning_rate": 8.579270380338107e-07, |
|
"loss": 0.0213, |
|
"reward": 0.5599559545516968, |
|
"reward_std": 0.20541859790682793, |
|
"rewards/improved_len_reward_dast": 0.5599559545516968, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2307.8009338378906, |
|
"epoch": 0.3370058327932599, |
|
"grad_norm": 0.16311538988092006, |
|
"kl": 0.005146026611328125, |
|
"learning_rate": 8.549348873887496e-07, |
|
"loss": 0.0135, |
|
"reward": 0.3861619606614113, |
|
"reward_std": 0.2727292329072952, |
|
"rewards/improved_len_reward_dast": 0.3861619606614113, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 1866.1173095703125, |
|
"epoch": 0.339598185353208, |
|
"grad_norm": 0.2056998520073749, |
|
"kl": 0.003955841064453125, |
|
"learning_rate": 8.519175975306312e-07, |
|
"loss": 0.0417, |
|
"reward": 0.24037051759660244, |
|
"reward_std": 0.3439597971737385, |
|
"rewards/improved_len_reward_dast": 0.24037051759660244, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 1911.8265075683594, |
|
"epoch": 0.3421905379131562, |
|
"grad_norm": 0.2317203557971351, |
|
"kl": 0.004306793212890625, |
|
"learning_rate": 8.48875417208601e-07, |
|
"loss": 0.0452, |
|
"reward": 0.47868431359529495, |
|
"reward_std": 0.23706890270113945, |
|
"rewards/improved_len_reward_dast": 0.47868431359529495, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 1986.4744567871094, |
|
"epoch": 0.34478289047310434, |
|
"grad_norm": 0.17738281540623924, |
|
"kl": 0.00514984130859375, |
|
"learning_rate": 8.458085972238048e-07, |
|
"loss": 0.0072, |
|
"reward": 0.28852372616529465, |
|
"reward_std": 0.2508459724485874, |
|
"rewards/improved_len_reward_dast": 0.28852372616529465, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1994.0815734863281, |
|
"epoch": 0.3473752430330525, |
|
"grad_norm": 0.1997705536336717, |
|
"kl": 0.00603485107421875, |
|
"learning_rate": 8.427173904087138e-07, |
|
"loss": -0.0078, |
|
"reward": 0.36212442070245743, |
|
"reward_std": 0.2850674279034138, |
|
"rewards/improved_len_reward_dast": 0.36212442070245743, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2514.382598876953, |
|
"epoch": 0.34996759559300067, |
|
"grad_norm": 0.15208989698192185, |
|
"kl": 0.00661468505859375, |
|
"learning_rate": 8.396020516062794e-07, |
|
"loss": 0.0032, |
|
"reward": 0.4105417560786009, |
|
"reward_std": 0.22361259534955025, |
|
"rewards/improved_len_reward_dast": 0.4105417560786009, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1611.4693908691406, |
|
"epoch": 0.3525599481529488, |
|
"grad_norm": 0.19115046617358702, |
|
"kl": 0.00412750244140625, |
|
"learning_rate": 8.364628376489242e-07, |
|
"loss": 0.0441, |
|
"reward": 0.5174260064959526, |
|
"reward_std": 0.2693428471684456, |
|
"rewards/improved_len_reward_dast": 0.5174260064959526, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1408.8061218261719, |
|
"epoch": 0.35515230071289694, |
|
"grad_norm": 0.1889385097712907, |
|
"kl": 0.00432586669921875, |
|
"learning_rate": 8.333000073373685e-07, |
|
"loss": -0.0013, |
|
"reward": 0.5440054759383202, |
|
"reward_std": 0.19834023714065552, |
|
"rewards/improved_len_reward_dast": 0.5440054759383202, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1900.6836242675781, |
|
"epoch": 0.3577446532728451, |
|
"grad_norm": 0.1681565313739815, |
|
"kl": 0.00485992431640625, |
|
"learning_rate": 8.301138214192945e-07, |
|
"loss": -0.0184, |
|
"reward": 0.49239661544561386, |
|
"reward_std": 0.29014211893081665, |
|
"rewards/improved_len_reward_dast": 0.49239661544561386, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 1999.6376953125, |
|
"epoch": 0.36033700583279327, |
|
"grad_norm": 0.15882776364135623, |
|
"kl": 0.00577545166015625, |
|
"learning_rate": 8.269045425678497e-07, |
|
"loss": 0.0007, |
|
"reward": 0.49145303666591644, |
|
"reward_std": 0.23764513432979584, |
|
"rewards/improved_len_reward_dast": 0.49145303666591644, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 1802.9540405273438, |
|
"epoch": 0.36292935839274143, |
|
"grad_norm": 0.17674925264442287, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 8.236724353599918e-07, |
|
"loss": 0.0455, |
|
"reward": 0.5860550999641418, |
|
"reward_std": 0.27273107320070267, |
|
"rewards/improved_len_reward_dast": 0.5860550999641418, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 1764.2857360839844, |
|
"epoch": 0.36552171095268954, |
|
"grad_norm": 0.20452351841611824, |
|
"kl": 0.005157470703125, |
|
"learning_rate": 8.204177662546763e-07, |
|
"loss": 0.0623, |
|
"reward": 0.5012878403067589, |
|
"reward_std": 0.24189992249011993, |
|
"rewards/improved_len_reward_dast": 0.5012878403067589, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 1690.7754516601562, |
|
"epoch": 0.3681140635126377, |
|
"grad_norm": 0.20408361176282866, |
|
"kl": 0.005096435546875, |
|
"learning_rate": 8.171408035708906e-07, |
|
"loss": 0.0395, |
|
"reward": 0.5669431537389755, |
|
"reward_std": 0.21233107149600983, |
|
"rewards/improved_len_reward_dast": 0.5669431537389755, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 1466.1887512207031, |
|
"epoch": 0.37070641607258586, |
|
"grad_norm": 0.18540115151616593, |
|
"kl": 0.004268646240234375, |
|
"learning_rate": 8.138418174655323e-07, |
|
"loss": 0.0175, |
|
"reward": 0.5513587966561317, |
|
"reward_std": 0.23814994096755981, |
|
"rewards/improved_len_reward_dast": 0.5513587966561317, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2337.9540405273438, |
|
"epoch": 0.37329876863253403, |
|
"grad_norm": 0.16911682150078117, |
|
"kl": 0.00580596923828125, |
|
"learning_rate": 8.105210799111366e-07, |
|
"loss": 0.0186, |
|
"reward": 0.4387804791331291, |
|
"reward_std": 0.22327708080410957, |
|
"rewards/improved_len_reward_dast": 0.4387804791331291, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 1876.8162536621094, |
|
"epoch": 0.3758911211924822, |
|
"grad_norm": 0.1658748038113864, |
|
"kl": 0.005420684814453125, |
|
"learning_rate": 8.071788646734564e-07, |
|
"loss": -0.0219, |
|
"reward": 0.49322987347841263, |
|
"reward_std": 0.17780348286032677, |
|
"rewards/improved_len_reward_dast": 0.49322987347841263, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 1766.6683654785156, |
|
"epoch": 0.37848347375243035, |
|
"grad_norm": 0.15094962051933156, |
|
"kl": 0.00490570068359375, |
|
"learning_rate": 8.038154472888909e-07, |
|
"loss": -0.0071, |
|
"reward": 0.5451386570930481, |
|
"reward_std": 0.24280473217368126, |
|
"rewards/improved_len_reward_dast": 0.5451386570930481, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 1868.7346801757812, |
|
"epoch": 0.38107582631237846, |
|
"grad_norm": 0.16172171063714397, |
|
"kl": 0.00629425048828125, |
|
"learning_rate": 8.004311050417711e-07, |
|
"loss": -0.0304, |
|
"reward": 0.5152218118309975, |
|
"reward_std": 0.2387254200875759, |
|
"rewards/improved_len_reward_dast": 0.5152218118309975, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1790.6223754882812, |
|
"epoch": 0.3836681788723266, |
|
"grad_norm": 0.1743302771283738, |
|
"kl": 0.005558013916015625, |
|
"learning_rate": 7.970261169414999e-07, |
|
"loss": 0.0191, |
|
"reward": 0.48244544118642807, |
|
"reward_std": 0.22489535436034203, |
|
"rewards/improved_len_reward_dast": 0.48244544118642807, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2116.790771484375, |
|
"epoch": 0.3862605314322748, |
|
"grad_norm": 0.17221394034022652, |
|
"kl": 0.00658416748046875, |
|
"learning_rate": 7.936007636995497e-07, |
|
"loss": 0.0313, |
|
"reward": 0.47787267714738846, |
|
"reward_std": 0.2161446176469326, |
|
"rewards/improved_len_reward_dast": 0.47787267714738846, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 1756.3367309570312, |
|
"epoch": 0.38885288399222295, |
|
"grad_norm": 0.21564957617935493, |
|
"kl": 0.005767822265625, |
|
"learning_rate": 7.901553277063213e-07, |
|
"loss": 0.0672, |
|
"reward": 0.3681907616555691, |
|
"reward_std": 0.2732224613428116, |
|
"rewards/improved_len_reward_dast": 0.3681907616555691, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 2132.2958984375, |
|
"epoch": 0.3914452365521711, |
|
"grad_norm": 0.21594042558519996, |
|
"kl": 0.00696563720703125, |
|
"learning_rate": 7.866900930078618e-07, |
|
"loss": 0.0453, |
|
"reward": 0.44704771041870117, |
|
"reward_std": 0.27593713626265526, |
|
"rewards/improved_len_reward_dast": 0.44704771041870117, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 1927.5152282714844, |
|
"epoch": 0.3940375891121192, |
|
"grad_norm": 0.1610545275313625, |
|
"kl": 0.0055389404296875, |
|
"learning_rate": 7.832053452824489e-07, |
|
"loss": 0.0042, |
|
"reward": 0.5329347252845764, |
|
"reward_std": 0.24699966236948967, |
|
"rewards/improved_len_reward_dast": 0.5329347252845764, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 1966.3009948730469, |
|
"epoch": 0.3966299416720674, |
|
"grad_norm": 0.16310455930995557, |
|
"kl": 0.00768280029296875, |
|
"learning_rate": 7.797013718170384e-07, |
|
"loss": -0.0202, |
|
"reward": 0.4772297702729702, |
|
"reward_std": 0.22153976559638977, |
|
"rewards/improved_len_reward_dast": 0.4772297702729702, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1658.9846954345703, |
|
"epoch": 0.39922229423201555, |
|
"grad_norm": 0.20975108303665252, |
|
"kl": 0.005176544189453125, |
|
"learning_rate": 7.761784614835801e-07, |
|
"loss": 0.0531, |
|
"reward": 0.44522225111722946, |
|
"reward_std": 0.2452440857887268, |
|
"rewards/improved_len_reward_dast": 0.44522225111722946, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 1932.2856903076172, |
|
"epoch": 0.4018146467919637, |
|
"grad_norm": 0.1943564114200122, |
|
"kl": 0.0059356689453125, |
|
"learning_rate": 7.726369047152029e-07, |
|
"loss": -0.0237, |
|
"reward": 0.42587200179696083, |
|
"reward_std": 0.22648616321384907, |
|
"rewards/improved_len_reward_dast": 0.42587200179696083, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 1716.0968933105469, |
|
"epoch": 0.4044069993519119, |
|
"grad_norm": 0.15181408541122274, |
|
"kl": 0.004180908203125, |
|
"learning_rate": 7.690769934822712e-07, |
|
"loss": 0.0045, |
|
"reward": 0.5171971023082733, |
|
"reward_std": 0.2628367580473423, |
|
"rewards/improved_len_reward_dast": 0.5171971023082733, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1752.392822265625, |
|
"epoch": 0.40699935191186, |
|
"grad_norm": 0.18038381295579756, |
|
"kl": 0.00592803955078125, |
|
"learning_rate": 7.654990212683142e-07, |
|
"loss": 0.0106, |
|
"reward": 0.5561209693551064, |
|
"reward_std": 0.2517809383571148, |
|
"rewards/improved_len_reward_dast": 0.5561209693551064, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 1883.0203247070312, |
|
"epoch": 0.40959170447180815, |
|
"grad_norm": 0.1789471226238438, |
|
"kl": 0.0065460205078125, |
|
"learning_rate": 7.619032830458307e-07, |
|
"loss": 0.0237, |
|
"reward": 0.5727267265319824, |
|
"reward_std": 0.2596842758357525, |
|
"rewards/improved_len_reward_dast": 0.5727267265319824, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 2159.1223754882812, |
|
"epoch": 0.4121840570317563, |
|
"grad_norm": 0.19892754911241722, |
|
"kl": 0.006744384765625, |
|
"learning_rate": 7.582900752519723e-07, |
|
"loss": 0.0183, |
|
"reward": 0.4676053449511528, |
|
"reward_std": 0.2507024519145489, |
|
"rewards/improved_len_reward_dast": 0.4676053449511528, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 2184.1122131347656, |
|
"epoch": 0.4147764095917045, |
|
"grad_norm": 0.19564213031602237, |
|
"kl": 0.0068359375, |
|
"learning_rate": 7.546596957641031e-07, |
|
"loss": 0.0021, |
|
"reward": 0.44793111085891724, |
|
"reward_std": 0.25819646567106247, |
|
"rewards/improved_len_reward_dast": 0.44793111085891724, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 2137.8213500976562, |
|
"epoch": 0.41736876215165264, |
|
"grad_norm": 0.18907736107190778, |
|
"kl": 0.0077056884765625, |
|
"learning_rate": 7.510124438752432e-07, |
|
"loss": 0.0214, |
|
"reward": 0.49345044791698456, |
|
"reward_std": 0.2429029531776905, |
|
"rewards/improved_len_reward_dast": 0.49345044791698456, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1669.6938171386719, |
|
"epoch": 0.4199611147116008, |
|
"grad_norm": 0.1907714193828339, |
|
"kl": 0.0060577392578125, |
|
"learning_rate": 7.473486202693949e-07, |
|
"loss": 0.0435, |
|
"reward": 0.664195254445076, |
|
"reward_std": 0.20142245292663574, |
|
"rewards/improved_len_reward_dast": 0.664195254445076, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 1800.2550659179688, |
|
"epoch": 0.4225534672715489, |
|
"grad_norm": 0.20951161548060418, |
|
"kl": 0.00634002685546875, |
|
"learning_rate": 7.43668526996753e-07, |
|
"loss": 0.0197, |
|
"reward": 0.4569202698767185, |
|
"reward_std": 0.27463599294424057, |
|
"rewards/improved_len_reward_dast": 0.4569202698767185, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1960.5714111328125, |
|
"epoch": 0.4251458198314971, |
|
"grad_norm": 0.17463868980210742, |
|
"kl": 0.00701141357421875, |
|
"learning_rate": 7.399724674488046e-07, |
|
"loss": -0.0162, |
|
"reward": 0.4838453456759453, |
|
"reward_std": 0.241712786257267, |
|
"rewards/improved_len_reward_dast": 0.4838453456759453, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 1959.688735961914, |
|
"epoch": 0.42773817239144524, |
|
"grad_norm": 0.1780914212189235, |
|
"kl": 0.00678253173828125, |
|
"learning_rate": 7.36260746333316e-07, |
|
"loss": 0.0377, |
|
"reward": 0.4423503875732422, |
|
"reward_std": 0.18371517956256866, |
|
"rewards/improved_len_reward_dast": 0.4423503875732422, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1861.6989440917969, |
|
"epoch": 0.4303305249513934, |
|
"grad_norm": 0.1834193572229538, |
|
"kl": 0.0063629150390625, |
|
"learning_rate": 7.325336696492128e-07, |
|
"loss": 0.0199, |
|
"reward": 0.577091321349144, |
|
"reward_std": 0.21984241902828217, |
|
"rewards/improved_len_reward_dast": 0.577091321349144, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 2081.637664794922, |
|
"epoch": 0.43292287751134156, |
|
"grad_norm": 0.16721105851518456, |
|
"kl": 0.00749969482421875, |
|
"learning_rate": 7.287915446613531e-07, |
|
"loss": 0.026, |
|
"reward": 0.45866213738918304, |
|
"reward_std": 0.20599739998579025, |
|
"rewards/improved_len_reward_dast": 0.45866213738918304, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 2199.9692993164062, |
|
"epoch": 0.43551523007128967, |
|
"grad_norm": 0.19137635279403245, |
|
"kl": 0.00868988037109375, |
|
"learning_rate": 7.250346798751953e-07, |
|
"loss": 0.0397, |
|
"reward": 0.40584760159254074, |
|
"reward_std": 0.32696348428726196, |
|
"rewards/improved_len_reward_dast": 0.40584760159254074, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 1805.3213958740234, |
|
"epoch": 0.43810758263123784, |
|
"grad_norm": 0.21092807084032023, |
|
"kl": 0.0059967041015625, |
|
"learning_rate": 7.212633850113662e-07, |
|
"loss": 0.0254, |
|
"reward": 0.4987664595246315, |
|
"reward_std": 0.2146843560039997, |
|
"rewards/improved_len_reward_dast": 0.4987664595246315, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1496.5714111328125, |
|
"epoch": 0.440699935191186, |
|
"grad_norm": 0.209791586936155, |
|
"kl": 0.0062408447265625, |
|
"learning_rate": 7.174779709801253e-07, |
|
"loss": -0.0119, |
|
"reward": 0.4626496955752373, |
|
"reward_std": 0.27531013265252113, |
|
"rewards/improved_len_reward_dast": 0.4626496955752373, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1733.1683044433594, |
|
"epoch": 0.44329228775113416, |
|
"grad_norm": 0.17464526469335237, |
|
"kl": 0.00637054443359375, |
|
"learning_rate": 7.136787498557344e-07, |
|
"loss": 0.0115, |
|
"reward": 0.47377418726682663, |
|
"reward_std": 0.31614845246076584, |
|
"rewards/improved_len_reward_dast": 0.47377418726682663, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1834.5152893066406, |
|
"epoch": 0.4458846403110823, |
|
"grad_norm": 0.2156862895463155, |
|
"kl": 0.00728607177734375, |
|
"learning_rate": 7.098660348507293e-07, |
|
"loss": 0.0408, |
|
"reward": 0.4768953248858452, |
|
"reward_std": 0.24819828569889069, |
|
"rewards/improved_len_reward_dast": 0.4768953248858452, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 1937.4999694824219, |
|
"epoch": 0.44847699287103043, |
|
"grad_norm": 0.16621803638711263, |
|
"kl": 0.00614166259765625, |
|
"learning_rate": 7.060401402900977e-07, |
|
"loss": -0.0075, |
|
"reward": 0.41049597412347794, |
|
"reward_std": 0.295925073325634, |
|
"rewards/improved_len_reward_dast": 0.41049597412347794, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 1872.5152893066406, |
|
"epoch": 0.4510693454309786, |
|
"grad_norm": 0.17350342194775273, |
|
"kl": 0.0063323974609375, |
|
"learning_rate": 7.022013815853672e-07, |
|
"loss": -0.0071, |
|
"reward": 0.4474351555109024, |
|
"reward_std": 0.2942516505718231, |
|
"rewards/improved_len_reward_dast": 0.4474351555109024, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1865.642822265625, |
|
"epoch": 0.45366169799092676, |
|
"grad_norm": 0.18659528012982252, |
|
"kl": 0.007171630859375, |
|
"learning_rate": 6.983500752086006e-07, |
|
"loss": 0.0382, |
|
"reward": 0.4795069247484207, |
|
"reward_std": 0.25236207991838455, |
|
"rewards/improved_len_reward_dast": 0.4795069247484207, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1636.3724365234375, |
|
"epoch": 0.4562540505508749, |
|
"grad_norm": 0.23440488356464137, |
|
"kl": 0.00577545166015625, |
|
"learning_rate": 6.94486538666307e-07, |
|
"loss": 0.0499, |
|
"reward": 0.5080657936632633, |
|
"reward_std": 0.23907579854130745, |
|
"rewards/improved_len_reward_dast": 0.5080657936632633, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1736.0662841796875, |
|
"epoch": 0.4588464031108231, |
|
"grad_norm": 0.1753900070568452, |
|
"kl": 0.00612640380859375, |
|
"learning_rate": 6.906110904732656e-07, |
|
"loss": 0.0149, |
|
"reward": 0.5958031266927719, |
|
"reward_std": 0.21545593440532684, |
|
"rewards/improved_len_reward_dast": 0.5958031266927719, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 2228.107147216797, |
|
"epoch": 0.46143875567077125, |
|
"grad_norm": 0.15717817851479735, |
|
"kl": 0.0067901611328125, |
|
"learning_rate": 6.867240501262666e-07, |
|
"loss": 0.0356, |
|
"reward": 0.4543240964412689, |
|
"reward_std": 0.22809231281280518, |
|
"rewards/improved_len_reward_dast": 0.4543240964412689, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 1626.4897766113281, |
|
"epoch": 0.46403110823071936, |
|
"grad_norm": 0.18177589779485429, |
|
"kl": 0.00614166259765625, |
|
"learning_rate": 6.828257380777723e-07, |
|
"loss": -0.0089, |
|
"reward": 0.31600036658346653, |
|
"reward_std": 0.3029540926218033, |
|
"rewards/improved_len_reward_dast": 0.31600036658346653, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1958.8315734863281, |
|
"epoch": 0.4666234607906675, |
|
"grad_norm": 0.1755764816680188, |
|
"kl": 0.00868988037109375, |
|
"learning_rate": 6.789164757094978e-07, |
|
"loss": -0.0072, |
|
"reward": 0.49836502969264984, |
|
"reward_std": 0.23551873490214348, |
|
"rewards/improved_len_reward_dast": 0.49836502969264984, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 1990.5101623535156, |
|
"epoch": 0.4692158133506157, |
|
"grad_norm": 0.18931918239703327, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 6.749965853059164e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4911562129855156, |
|
"reward_std": 0.3188675567507744, |
|
"rewards/improved_len_reward_dast": 0.4911562129855156, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1658.4642028808594, |
|
"epoch": 0.47180816591056385, |
|
"grad_norm": 0.19209648070470173, |
|
"kl": 0.0060577392578125, |
|
"learning_rate": 6.710663900276903e-07, |
|
"loss": -0.0146, |
|
"reward": 0.43227584287524223, |
|
"reward_std": 0.2176770530641079, |
|
"rewards/improved_len_reward_dast": 0.43227584287524223, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1583.1224365234375, |
|
"epoch": 0.474400518470512, |
|
"grad_norm": 0.2038966371816177, |
|
"kl": 0.00626373291015625, |
|
"learning_rate": 6.671262138850274e-07, |
|
"loss": 0.0399, |
|
"reward": 0.5645861923694611, |
|
"reward_std": 0.23070186376571655, |
|
"rewards/improved_len_reward_dast": 0.5645861923694611, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1520.1683044433594, |
|
"epoch": 0.4769928710304601, |
|
"grad_norm": 0.18304431320557107, |
|
"kl": 0.006195068359375, |
|
"learning_rate": 6.631763817109717e-07, |
|
"loss": 0.0255, |
|
"reward": 0.5742315426468849, |
|
"reward_std": 0.23619792237877846, |
|
"rewards/improved_len_reward_dast": 0.5742315426468849, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1594.3519897460938, |
|
"epoch": 0.4795852235904083, |
|
"grad_norm": 0.16992347566951083, |
|
"kl": 0.0060272216796875, |
|
"learning_rate": 6.592172191346218e-07, |
|
"loss": 0.0111, |
|
"reward": 0.5422097221016884, |
|
"reward_std": 0.25979943573474884, |
|
"rewards/improved_len_reward_dast": 0.5422097221016884, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 1702.4183654785156, |
|
"epoch": 0.48217757615035645, |
|
"grad_norm": 0.20895915866852888, |
|
"kl": 0.00782012939453125, |
|
"learning_rate": 6.552490525542864e-07, |
|
"loss": -0.0109, |
|
"reward": 0.49483248591423035, |
|
"reward_std": 0.18612295389175415, |
|
"rewards/improved_len_reward_dast": 0.49483248591423035, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 1660.1836547851562, |
|
"epoch": 0.4847699287103046, |
|
"grad_norm": 0.177095187259404, |
|
"kl": 0.00647735595703125, |
|
"learning_rate": 6.512722091105757e-07, |
|
"loss": 0.0079, |
|
"reward": 0.41740237921476364, |
|
"reward_std": 0.3016924597322941, |
|
"rewards/improved_len_reward_dast": 0.41740237921476364, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1430.4846649169922, |
|
"epoch": 0.4873622812702528, |
|
"grad_norm": 0.1856358739213989, |
|
"kl": 0.00666046142578125, |
|
"learning_rate": 6.472870166594314e-07, |
|
"loss": 0.0043, |
|
"reward": 0.6006389036774635, |
|
"reward_std": 0.20018238201737404, |
|
"rewards/improved_len_reward_dast": 0.6006389036774635, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1959.5305786132812, |
|
"epoch": 0.4899546338302009, |
|
"grad_norm": 0.2109811675895551, |
|
"kl": 0.00728607177734375, |
|
"learning_rate": 6.432938037450974e-07, |
|
"loss": 0.0321, |
|
"reward": 0.4208461381494999, |
|
"reward_std": 0.25322337821125984, |
|
"rewards/improved_len_reward_dast": 0.4208461381494999, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 1917.4489135742188, |
|
"epoch": 0.49254698639014904, |
|
"grad_norm": 0.17048136291778715, |
|
"kl": 0.00824737548828125, |
|
"learning_rate": 6.392928995730352e-07, |
|
"loss": 0.0145, |
|
"reward": 0.4505029022693634, |
|
"reward_std": 0.2619488127529621, |
|
"rewards/improved_len_reward_dast": 0.4505029022693634, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1402.14794921875, |
|
"epoch": 0.4951393389500972, |
|
"grad_norm": 0.18497170613874878, |
|
"kl": 0.00598907470703125, |
|
"learning_rate": 6.352846339827826e-07, |
|
"loss": -0.0062, |
|
"reward": 0.5656020939350128, |
|
"reward_std": 0.21923119574785233, |
|
"rewards/improved_len_reward_dast": 0.5656020939350128, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 1465.3367004394531, |
|
"epoch": 0.49773169151004537, |
|
"grad_norm": 0.21257458462794812, |
|
"kl": 0.00757598876953125, |
|
"learning_rate": 6.312693374207627e-07, |
|
"loss": 0.0312, |
|
"reward": 0.5634729117155075, |
|
"reward_std": 0.2284911945462227, |
|
"rewards/improved_len_reward_dast": 0.5634729117155075, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1696.1326293945312, |
|
"epoch": 0.5003240440699935, |
|
"grad_norm": 0.1917176386219813, |
|
"kl": 0.00817108154296875, |
|
"learning_rate": 6.272473409130397e-07, |
|
"loss": -0.0122, |
|
"reward": 0.5527424663305283, |
|
"reward_std": 0.21831231378018856, |
|
"rewards/improved_len_reward_dast": 0.5527424663305283, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1484.0254821777344, |
|
"epoch": 0.5029163966299417, |
|
"grad_norm": 0.18489831646493407, |
|
"kl": 0.00533294677734375, |
|
"learning_rate": 6.232189760380301e-07, |
|
"loss": -0.0092, |
|
"reward": 0.5197786688804626, |
|
"reward_std": 0.2706274203956127, |
|
"rewards/improved_len_reward_dast": 0.5197786688804626, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1797.8571166992188, |
|
"epoch": 0.5055087491898899, |
|
"grad_norm": 0.2022876784231448, |
|
"kl": 0.0066070556640625, |
|
"learning_rate": 6.191845748991671e-07, |
|
"loss": 0.0034, |
|
"reward": 0.5193638280034065, |
|
"reward_std": 0.15613215044140816, |
|
"rewards/improved_len_reward_dast": 0.5193638280034065, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 1814.0611877441406, |
|
"epoch": 0.508101101749838, |
|
"grad_norm": 0.16795830607580578, |
|
"kl": 0.00803375244140625, |
|
"learning_rate": 6.151444700975203e-07, |
|
"loss": 0.0087, |
|
"reward": 0.5806097835302353, |
|
"reward_std": 0.21784314513206482, |
|
"rewards/improved_len_reward_dast": 0.5806097835302353, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 2092.0663146972656, |
|
"epoch": 0.5106934543097861, |
|
"grad_norm": 0.18162024484286257, |
|
"kl": 0.007904052734375, |
|
"learning_rate": 6.110989947043767e-07, |
|
"loss": 0.0292, |
|
"reward": 0.3626396246254444, |
|
"reward_std": 0.2863907441496849, |
|
"rewards/improved_len_reward_dast": 0.3626396246254444, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 1512.7142333984375, |
|
"epoch": 0.5132858068697342, |
|
"grad_norm": 0.25742266689895515, |
|
"kl": 0.0079498291015625, |
|
"learning_rate": 6.070484822337816e-07, |
|
"loss": 0.0789, |
|
"reward": 0.4790092930197716, |
|
"reward_std": 0.24650050699710846, |
|
"rewards/improved_len_reward_dast": 0.4790092930197716, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 1804.9438171386719, |
|
"epoch": 0.5158781594296824, |
|
"grad_norm": 0.1817030599796311, |
|
"kl": 0.00815582275390625, |
|
"learning_rate": 6.029932666150431e-07, |
|
"loss": 0.0508, |
|
"reward": 0.45782896876335144, |
|
"reward_std": 0.257952194660902, |
|
"rewards/improved_len_reward_dast": 0.45782896876335144, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1632.5561218261719, |
|
"epoch": 0.5184705119896306, |
|
"grad_norm": 0.1682470886081435, |
|
"kl": 0.00696563720703125, |
|
"learning_rate": 5.989336821652029e-07, |
|
"loss": 0.0123, |
|
"reward": 0.49326401203870773, |
|
"reward_std": 0.2727998159825802, |
|
"rewards/improved_len_reward_dast": 0.49326401203870773, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1362.3570861816406, |
|
"epoch": 0.5210628645495787, |
|
"grad_norm": 0.2140704882737376, |
|
"kl": 0.00643157958984375, |
|
"learning_rate": 5.948700635614745e-07, |
|
"loss": 0.0187, |
|
"reward": 0.30808811727911234, |
|
"reward_std": 0.31461621820926666, |
|
"rewards/improved_len_reward_dast": 0.30808811727911234, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1429.9183349609375, |
|
"epoch": 0.5236552171095269, |
|
"grad_norm": 0.19196609876831122, |
|
"kl": 0.0064544677734375, |
|
"learning_rate": 5.908027458136518e-07, |
|
"loss": 0.0442, |
|
"reward": 0.6718090772628784, |
|
"reward_std": 0.19283609837293625, |
|
"rewards/improved_len_reward_dast": 0.6718090772628784, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1489.9336242675781, |
|
"epoch": 0.5262475696694751, |
|
"grad_norm": 0.15253854414767434, |
|
"kl": 0.00498199462890625, |
|
"learning_rate": 5.867320642364916e-07, |
|
"loss": 0.013, |
|
"reward": 0.6136546954512596, |
|
"reward_std": 0.23966671898961067, |
|
"rewards/improved_len_reward_dast": 0.6136546954512596, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1689.6377563476562, |
|
"epoch": 0.5288399222294232, |
|
"grad_norm": 0.1927055929645657, |
|
"kl": 0.00634002685546875, |
|
"learning_rate": 5.826583544220678e-07, |
|
"loss": -0.0038, |
|
"reward": 0.49707163125276566, |
|
"reward_std": 0.2744743190705776, |
|
"rewards/improved_len_reward_dast": 0.49707163125276566, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1282.392837524414, |
|
"epoch": 0.5314322747893714, |
|
"grad_norm": 0.17997245193338135, |
|
"kl": 0.0058441162109375, |
|
"learning_rate": 5.78581952212107e-07, |
|
"loss": 0.0106, |
|
"reward": 0.5941964462399483, |
|
"reward_std": 0.18648012727499008, |
|
"rewards/improved_len_reward_dast": 0.5941964462399483, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1697.3724670410156, |
|
"epoch": 0.5340246273493195, |
|
"grad_norm": 0.17197705412944048, |
|
"kl": 0.00629425048828125, |
|
"learning_rate": 5.745031936702997e-07, |
|
"loss": -0.0017, |
|
"reward": 0.47735022753477097, |
|
"reward_std": 0.23548034578561783, |
|
"rewards/improved_len_reward_dast": 0.47735022753477097, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1744.994873046875, |
|
"epoch": 0.5366169799092677, |
|
"grad_norm": 0.17998891738721526, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 5.704224150545956e-07, |
|
"loss": 0.015, |
|
"reward": 0.41617942601442337, |
|
"reward_std": 0.2671221233904362, |
|
"rewards/improved_len_reward_dast": 0.41617942601442337, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1260.086685180664, |
|
"epoch": 0.5392093324692158, |
|
"grad_norm": 0.20122932764023702, |
|
"kl": 0.005645751953125, |
|
"learning_rate": 5.663399527894816e-07, |
|
"loss": 0.0215, |
|
"reward": 0.6909545063972473, |
|
"reward_std": 0.20719094015657902, |
|
"rewards/improved_len_reward_dast": 0.6909545063972473, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1616.2958984375, |
|
"epoch": 0.5418016850291639, |
|
"grad_norm": 0.1906417060993576, |
|
"kl": 0.00637054443359375, |
|
"learning_rate": 5.622561434382467e-07, |
|
"loss": 0.0233, |
|
"reward": 0.45805612206459045, |
|
"reward_std": 0.25512534007430077, |
|
"rewards/improved_len_reward_dast": 0.45805612206459045, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 2039.3724365234375, |
|
"epoch": 0.5443940375891121, |
|
"grad_norm": 0.19498631453970255, |
|
"kl": 0.00815582275390625, |
|
"learning_rate": 5.581713236752361e-07, |
|
"loss": -0.0184, |
|
"reward": 0.46333739161491394, |
|
"reward_std": 0.27003057673573494, |
|
"rewards/improved_len_reward_dast": 0.46333739161491394, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1523.2754821777344, |
|
"epoch": 0.5469863901490603, |
|
"grad_norm": 0.20159676101246632, |
|
"kl": 0.006622314453125, |
|
"learning_rate": 5.540858302580934e-07, |
|
"loss": 0.0411, |
|
"reward": 0.5782179459929466, |
|
"reward_std": 0.2389019876718521, |
|
"rewards/improved_len_reward_dast": 0.5782179459929466, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1725.341796875, |
|
"epoch": 0.5495787427090084, |
|
"grad_norm": 0.18132979624264559, |
|
"kl": 0.0070343017578125, |
|
"learning_rate": 5.5e-07, |
|
"loss": -0.0379, |
|
"reward": 0.2929552085697651, |
|
"reward_std": 0.2936428487300873, |
|
"rewards/improved_len_reward_dast": 0.2929552085697651, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1522.3009948730469, |
|
"epoch": 0.5521710952689566, |
|
"grad_norm": 1.2243596579933262, |
|
"kl": 0.01662445068359375, |
|
"learning_rate": 5.459141697419066e-07, |
|
"loss": 0.0108, |
|
"reward": 0.5409562550485134, |
|
"reward_std": 0.22488684952259064, |
|
"rewards/improved_len_reward_dast": 0.5409562550485134, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1319.0765075683594, |
|
"epoch": 0.5547634478289047, |
|
"grad_norm": 0.1722252949208986, |
|
"kl": 0.00470733642578125, |
|
"learning_rate": 5.418286763247641e-07, |
|
"loss": 0.0069, |
|
"reward": 0.6275194361805916, |
|
"reward_std": 0.2474201563745737, |
|
"rewards/improved_len_reward_dast": 0.6275194361805916, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1788.4846801757812, |
|
"epoch": 0.5573558003888529, |
|
"grad_norm": 1.2611041670931815, |
|
"kl": 0.013458251953125, |
|
"learning_rate": 5.377438565617532e-07, |
|
"loss": -0.0125, |
|
"reward": 0.45079565048217773, |
|
"reward_std": 0.280511736869812, |
|
"rewards/improved_len_reward_dast": 0.45079565048217773, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 2106.933624267578, |
|
"epoch": 0.5599481529488011, |
|
"grad_norm": 0.19693707615605538, |
|
"kl": 0.00965118408203125, |
|
"learning_rate": 5.336600472105186e-07, |
|
"loss": 0.0099, |
|
"reward": 0.44794752448797226, |
|
"reward_std": 0.2532258592545986, |
|
"rewards/improved_len_reward_dast": 0.44794752448797226, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1594.5458984375, |
|
"epoch": 0.5625405055087492, |
|
"grad_norm": 0.18133552112289136, |
|
"kl": 0.0072021484375, |
|
"learning_rate": 5.295775849454045e-07, |
|
"loss": 0.0022, |
|
"reward": 0.35640399530529976, |
|
"reward_std": 0.22871940955519676, |
|
"rewards/improved_len_reward_dast": 0.35640399530529976, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 2063.7244262695312, |
|
"epoch": 0.5651328580686974, |
|
"grad_norm": 0.1653596633833327, |
|
"kl": 0.0097503662109375, |
|
"learning_rate": 5.254968063297003e-07, |
|
"loss": -0.0067, |
|
"reward": 0.39714931696653366, |
|
"reward_std": 0.2558470740914345, |
|
"rewards/improved_len_reward_dast": 0.39714931696653366, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1957.7550659179688, |
|
"epoch": 0.5677252106286454, |
|
"grad_norm": 0.17831099916308876, |
|
"kl": 0.008544921875, |
|
"learning_rate": 5.214180477878931e-07, |
|
"loss": 0.0054, |
|
"reward": 0.4488506466150284, |
|
"reward_std": 0.28022897988557816, |
|
"rewards/improved_len_reward_dast": 0.4488506466150284, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 2230.1275024414062, |
|
"epoch": 0.5703175631885936, |
|
"grad_norm": 0.23765533370039002, |
|
"kl": 0.01194000244140625, |
|
"learning_rate": 5.173416455779323e-07, |
|
"loss": -0.0122, |
|
"reward": 0.4094167836010456, |
|
"reward_std": 0.2675712872296572, |
|
"rewards/improved_len_reward_dast": 0.4094167836010456, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1626.5203857421875, |
|
"epoch": 0.5729099157485418, |
|
"grad_norm": 0.2314899179996737, |
|
"kl": 0.00766754150390625, |
|
"learning_rate": 5.132679357635086e-07, |
|
"loss": 0.0299, |
|
"reward": 0.46570489555597305, |
|
"reward_std": 0.21487142890691757, |
|
"rewards/improved_len_reward_dast": 0.46570489555597305, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 1790.2397766113281, |
|
"epoch": 0.5755022683084899, |
|
"grad_norm": 0.1860804689790846, |
|
"kl": 0.00777435302734375, |
|
"learning_rate": 5.091972541863481e-07, |
|
"loss": 0.0162, |
|
"reward": 0.42583951354026794, |
|
"reward_std": 0.28225597366690636, |
|
"rewards/improved_len_reward_dast": 0.42583951354026794, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1288.7959289550781, |
|
"epoch": 0.5780946208684381, |
|
"grad_norm": 0.20669865534482024, |
|
"kl": 0.0060882568359375, |
|
"learning_rate": 5.051299364385257e-07, |
|
"loss": 0.0231, |
|
"reward": 0.5595748201012611, |
|
"reward_std": 0.2434106133878231, |
|
"rewards/improved_len_reward_dast": 0.5595748201012611, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 2255.5509643554688, |
|
"epoch": 0.5806869734283863, |
|
"grad_norm": 0.18272410031873515, |
|
"kl": 0.01136016845703125, |
|
"learning_rate": 5.010663178347971e-07, |
|
"loss": 0.0234, |
|
"reward": 0.47463729977607727, |
|
"reward_std": 0.2821599170565605, |
|
"rewards/improved_len_reward_dast": 0.47463729977607727, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1452.3367004394531, |
|
"epoch": 0.5832793259883344, |
|
"grad_norm": 0.191897675765259, |
|
"kl": 0.007843017578125, |
|
"learning_rate": 4.970067333849568e-07, |
|
"loss": 0.0344, |
|
"reward": 0.502272866666317, |
|
"reward_std": 0.22649240121245384, |
|
"rewards/improved_len_reward_dast": 0.502272866666317, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1654.5408020019531, |
|
"epoch": 0.5858716785482826, |
|
"grad_norm": 3.553604338224093, |
|
"kl": 0.09326171875, |
|
"learning_rate": 4.929515177662182e-07, |
|
"loss": 0.0222, |
|
"reward": 0.47584168612957, |
|
"reward_std": 0.2622257173061371, |
|
"rewards/improved_len_reward_dast": 0.47584168612957, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1871.4132385253906, |
|
"epoch": 0.5884640311082308, |
|
"grad_norm": 0.1695314613295998, |
|
"kl": 0.0087432861328125, |
|
"learning_rate": 4.889010052956233e-07, |
|
"loss": 0.0052, |
|
"reward": 0.5272805392742157, |
|
"reward_std": 0.2442457675933838, |
|
"rewards/improved_len_reward_dast": 0.5272805392742157, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1353.8213958740234, |
|
"epoch": 0.5910563836681789, |
|
"grad_norm": 0.18264539162380725, |
|
"kl": 0.0059661865234375, |
|
"learning_rate": 4.848555299024798e-07, |
|
"loss": 0.0225, |
|
"reward": 0.6240374892950058, |
|
"reward_std": 0.1938109789043665, |
|
"rewards/improved_len_reward_dast": 0.6240374892950058, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1727.3112182617188, |
|
"epoch": 0.593648736228127, |
|
"grad_norm": 0.1734617136793063, |
|
"kl": 0.00876617431640625, |
|
"learning_rate": 4.80815425100833e-07, |
|
"loss": 0.0211, |
|
"reward": 0.5374634936451912, |
|
"reward_std": 0.22801294550299644, |
|
"rewards/improved_len_reward_dast": 0.5374634936451912, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1789.8826293945312, |
|
"epoch": 0.5962410887880751, |
|
"grad_norm": 0.18912805948764874, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 4.7678102396196983e-07, |
|
"loss": -0.0126, |
|
"reward": 0.4900341257452965, |
|
"reward_std": 0.22806890308856964, |
|
"rewards/improved_len_reward_dast": 0.4900341257452965, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1693.0509948730469, |
|
"epoch": 0.5988334413480233, |
|
"grad_norm": 0.17597707523302372, |
|
"kl": 0.00720977783203125, |
|
"learning_rate": 4.727526590869605e-07, |
|
"loss": -0.012, |
|
"reward": 0.47888991981744766, |
|
"reward_std": 0.2167413793504238, |
|
"rewards/improved_len_reward_dast": 0.47888991981744766, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1440.198959350586, |
|
"epoch": 0.6014257939079715, |
|
"grad_norm": 0.3232116423618759, |
|
"kl": 0.00661468505859375, |
|
"learning_rate": 4.6873066257923735e-07, |
|
"loss": -0.0066, |
|
"reward": 0.4444720149040222, |
|
"reward_std": 0.1700468622148037, |
|
"rewards/improved_len_reward_dast": 0.4444720149040222, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 2216.800994873047, |
|
"epoch": 0.6040181464679196, |
|
"grad_norm": 0.22016095164256272, |
|
"kl": 0.00894927978515625, |
|
"learning_rate": 4.647153660172173e-07, |
|
"loss": 0.0621, |
|
"reward": 0.46872628480196, |
|
"reward_std": 0.26834653317928314, |
|
"rewards/improved_len_reward_dast": 0.46872628480196, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1988.7550659179688, |
|
"epoch": 0.6066104990278678, |
|
"grad_norm": 0.20038227395204466, |
|
"kl": 0.01000213623046875, |
|
"learning_rate": 4.607071004269647e-07, |
|
"loss": 0.0274, |
|
"reward": 0.5112068131566048, |
|
"reward_std": 0.2712997607886791, |
|
"rewards/improved_len_reward_dast": 0.5112068131566048, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1631.9132385253906, |
|
"epoch": 0.609202851587816, |
|
"grad_norm": 0.20175936218365448, |
|
"kl": 0.00800323486328125, |
|
"learning_rate": 4.567061962549025e-07, |
|
"loss": -0.0159, |
|
"reward": 0.5118747428059578, |
|
"reward_std": 0.27685124427080154, |
|
"rewards/improved_len_reward_dast": 0.5118747428059578, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1950.7857055664062, |
|
"epoch": 0.6117952041477641, |
|
"grad_norm": 0.18134020080561208, |
|
"kl": 0.0106353759765625, |
|
"learning_rate": 4.527129833405687e-07, |
|
"loss": 0.0038, |
|
"reward": 0.5120773538947105, |
|
"reward_std": 0.20266427472233772, |
|
"rewards/improved_len_reward_dast": 0.5120773538947105, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1583.6836242675781, |
|
"epoch": 0.6143875567077123, |
|
"grad_norm": 0.2073936085236417, |
|
"kl": 0.007110595703125, |
|
"learning_rate": 4.4872779088942425e-07, |
|
"loss": 0.0249, |
|
"reward": 0.5029871687293053, |
|
"reward_std": 0.29201821237802505, |
|
"rewards/improved_len_reward_dast": 0.5029871687293053, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1926.5203857421875, |
|
"epoch": 0.6169799092676604, |
|
"grad_norm": 0.1774462909740951, |
|
"kl": 0.0105438232421875, |
|
"learning_rate": 4.447509474457135e-07, |
|
"loss": 0.0247, |
|
"reward": 0.5557538792490959, |
|
"reward_std": 0.26750218868255615, |
|
"rewards/improved_len_reward_dast": 0.5557538792490959, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1847.1377563476562, |
|
"epoch": 0.6195722618276086, |
|
"grad_norm": 0.19817543454609501, |
|
"kl": 0.00870513916015625, |
|
"learning_rate": 4.4078278086537823e-07, |
|
"loss": 0.0044, |
|
"reward": 0.5904448255896568, |
|
"reward_std": 0.2602488324046135, |
|
"rewards/improved_len_reward_dast": 0.5904448255896568, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1411.2958984375, |
|
"epoch": 0.6221646143875567, |
|
"grad_norm": 0.1766269163216527, |
|
"kl": 0.0061492919921875, |
|
"learning_rate": 4.3682361828902846e-07, |
|
"loss": 0.0087, |
|
"reward": 0.5749641954898834, |
|
"reward_std": 0.2569588888436556, |
|
"rewards/improved_len_reward_dast": 0.5749641954898834, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1658.341812133789, |
|
"epoch": 0.6247569669475048, |
|
"grad_norm": 0.18786331552156418, |
|
"kl": 0.00792694091796875, |
|
"learning_rate": 4.328737861149726e-07, |
|
"loss": 0.0186, |
|
"reward": 0.33391743153333664, |
|
"reward_std": 0.2631943728774786, |
|
"rewards/improved_len_reward_dast": 0.33391743153333664, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1752.903060913086, |
|
"epoch": 0.627349319507453, |
|
"grad_norm": 0.16247838117797347, |
|
"kl": 0.00801849365234375, |
|
"learning_rate": 4.289336099723098e-07, |
|
"loss": -0.0073, |
|
"reward": 0.5536581799387932, |
|
"reward_std": 0.1957964338362217, |
|
"rewards/improved_len_reward_dast": 0.5536581799387932, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1809.596923828125, |
|
"epoch": 0.6299416720674011, |
|
"grad_norm": 0.2161692721743295, |
|
"kl": 0.009197235107421875, |
|
"learning_rate": 4.250034146940834e-07, |
|
"loss": 0.0363, |
|
"reward": 0.5511080101132393, |
|
"reward_std": 0.18104272708296776, |
|
"rewards/improved_len_reward_dast": 0.5511080101132393, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1582.2550964355469, |
|
"epoch": 0.6325340246273493, |
|
"grad_norm": 0.19382955679222108, |
|
"kl": 0.00809478759765625, |
|
"learning_rate": 4.210835242905023e-07, |
|
"loss": 0.0326, |
|
"reward": 0.5464158207178116, |
|
"reward_std": 0.25605272501707077, |
|
"rewards/improved_len_reward_dast": 0.5464158207178116, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1810.8673400878906, |
|
"epoch": 0.6351263771872975, |
|
"grad_norm": 0.19640394391195734, |
|
"kl": 0.00948333740234375, |
|
"learning_rate": 4.1717426192222784e-07, |
|
"loss": 0.0292, |
|
"reward": 0.5271303877234459, |
|
"reward_std": 0.20934263616800308, |
|
"rewards/improved_len_reward_dast": 0.5271303877234459, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 2071.0305786132812, |
|
"epoch": 0.6377187297472456, |
|
"grad_norm": 0.2180548556252771, |
|
"kl": 0.00945281982421875, |
|
"learning_rate": 4.1327594987373347e-07, |
|
"loss": 0.0067, |
|
"reward": 0.38764588721096516, |
|
"reward_std": 0.23858479037880898, |
|
"rewards/improved_len_reward_dast": 0.38764588721096516, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 2013.1223754882812, |
|
"epoch": 0.6403110823071938, |
|
"grad_norm": 0.1930442243491151, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 4.0938890952673443e-07, |
|
"loss": -0.0222, |
|
"reward": 0.46171685308218, |
|
"reward_std": 0.18625032529234886, |
|
"rewards/improved_len_reward_dast": 0.46171685308218, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1950.23974609375, |
|
"epoch": 0.642903434867142, |
|
"grad_norm": 0.17065165751997421, |
|
"kl": 0.01047515869140625, |
|
"learning_rate": 4.05513461333693e-07, |
|
"loss": 0.008, |
|
"reward": 0.48094385862350464, |
|
"reward_std": 0.25418727472424507, |
|
"rewards/improved_len_reward_dast": 0.48094385862350464, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1824.0305480957031, |
|
"epoch": 0.6454957874270901, |
|
"grad_norm": 0.17991006553556818, |
|
"kl": 0.0101165771484375, |
|
"learning_rate": 4.016499247913994e-07, |
|
"loss": 0.0192, |
|
"reward": 0.517502948641777, |
|
"reward_std": 0.22622046247124672, |
|
"rewards/improved_len_reward_dast": 0.517502948641777, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1910.5152587890625, |
|
"epoch": 0.6480881399870383, |
|
"grad_norm": 0.17870026031791122, |
|
"kl": 0.0102386474609375, |
|
"learning_rate": 3.977986184146328e-07, |
|
"loss": 0.0037, |
|
"reward": 0.6127093955874443, |
|
"reward_std": 0.24417436867952347, |
|
"rewards/improved_len_reward_dast": 0.6127093955874443, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 1963.0713500976562, |
|
"epoch": 0.6506804925469863, |
|
"grad_norm": 0.17652108057579807, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 3.939598597099022e-07, |
|
"loss": -0.0145, |
|
"reward": 0.31848039478063583, |
|
"reward_std": 0.29098184034228325, |
|
"rewards/improved_len_reward_dast": 0.31848039478063583, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1713.6223754882812, |
|
"epoch": 0.6532728451069345, |
|
"grad_norm": 0.17142346849906642, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 3.9013396514927076e-07, |
|
"loss": 0.0119, |
|
"reward": 0.47325168550014496, |
|
"reward_std": 0.24261003732681274, |
|
"rewards/improved_len_reward_dast": 0.47325168550014496, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 2062.188751220703, |
|
"epoch": 0.6558651976668827, |
|
"grad_norm": 0.16335646698761644, |
|
"kl": 0.0091094970703125, |
|
"learning_rate": 3.8632125014426566e-07, |
|
"loss": 0.0001, |
|
"reward": 0.40878694504499435, |
|
"reward_std": 0.24995128065347672, |
|
"rewards/improved_len_reward_dast": 0.40878694504499435, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 2270.3978881835938, |
|
"epoch": 0.6584575502268308, |
|
"grad_norm": 0.16560269347700182, |
|
"kl": 0.0117950439453125, |
|
"learning_rate": 3.8252202901987474e-07, |
|
"loss": 0.0271, |
|
"reward": 0.46808916330337524, |
|
"reward_std": 0.21613015979528427, |
|
"rewards/improved_len_reward_dast": 0.46808916330337524, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1911.0305786132812, |
|
"epoch": 0.661049902786779, |
|
"grad_norm": 0.18694188171422124, |
|
"kl": 0.0100860595703125, |
|
"learning_rate": 3.7873661498863384e-07, |
|
"loss": -0.0122, |
|
"reward": 0.5386775732040405, |
|
"reward_std": 0.267782025039196, |
|
"rewards/improved_len_reward_dast": 0.5386775732040405, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1850.2805786132812, |
|
"epoch": 0.6636422553467272, |
|
"grad_norm": 99.3387811319855, |
|
"kl": 0.239776611328125, |
|
"learning_rate": 3.7496532012480463e-07, |
|
"loss": 0.0278, |
|
"reward": 0.4910132810473442, |
|
"reward_std": 0.2469508834183216, |
|
"rewards/improved_len_reward_dast": 0.4910132810473442, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1687.4540405273438, |
|
"epoch": 0.6662346079066753, |
|
"grad_norm": 0.2081083237423023, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 3.7120845533864706e-07, |
|
"loss": 0.0474, |
|
"reward": 0.5329510420560837, |
|
"reward_std": 0.18445927649736404, |
|
"rewards/improved_len_reward_dast": 0.5329510420560837, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 2054.6275024414062, |
|
"epoch": 0.6688269604666235, |
|
"grad_norm": 0.2198626965043421, |
|
"kl": 0.0117645263671875, |
|
"learning_rate": 3.6746633035078723e-07, |
|
"loss": -0.0103, |
|
"reward": 0.38417188823223114, |
|
"reward_std": 0.19621288403868675, |
|
"rewards/improved_len_reward_dast": 0.38417188823223114, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1646.8213806152344, |
|
"epoch": 0.6714193130265717, |
|
"grad_norm": 0.19620593996030333, |
|
"kl": 0.00890350341796875, |
|
"learning_rate": 3.63739253666684e-07, |
|
"loss": 0.0092, |
|
"reward": 0.5395868346095085, |
|
"reward_std": 0.2524537071585655, |
|
"rewards/improved_len_reward_dast": 0.5395868346095085, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 2210.8162536621094, |
|
"epoch": 0.6740116655865198, |
|
"grad_norm": 0.18772992549784875, |
|
"kl": 0.01015472412109375, |
|
"learning_rate": 3.6002753255119533e-07, |
|
"loss": 0.0418, |
|
"reward": 0.5520248711109161, |
|
"reward_std": 0.25951434671878815, |
|
"rewards/improved_len_reward_dast": 0.5520248711109161, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1637.6224060058594, |
|
"epoch": 0.6766040181464679, |
|
"grad_norm": 0.20139190537407078, |
|
"kl": 0.009979248046875, |
|
"learning_rate": 3.5633147300324706e-07, |
|
"loss": 0.0317, |
|
"reward": 0.47854653000831604, |
|
"reward_std": 0.21838786266744137, |
|
"rewards/improved_len_reward_dast": 0.47854653000831604, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1974.5560302734375, |
|
"epoch": 0.679196370706416, |
|
"grad_norm": 0.1703155196433375, |
|
"kl": 0.0100555419921875, |
|
"learning_rate": 3.526513797306051e-07, |
|
"loss": -0.0087, |
|
"reward": 0.5659954845905304, |
|
"reward_std": 0.22994915768504143, |
|
"rewards/improved_len_reward_dast": 0.5659954845905304, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 2071.5663146972656, |
|
"epoch": 0.6817887232663642, |
|
"grad_norm": 0.16298809195534278, |
|
"kl": 0.013153076171875, |
|
"learning_rate": 3.489875561247568e-07, |
|
"loss": 0.0145, |
|
"reward": 0.46151311695575714, |
|
"reward_std": 0.2671518959105015, |
|
"rewards/improved_len_reward_dast": 0.46151311695575714, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1674.2091064453125, |
|
"epoch": 0.6843810758263124, |
|
"grad_norm": 0.1864849140911452, |
|
"kl": 0.00850677490234375, |
|
"learning_rate": 3.453403042358968e-07, |
|
"loss": 0.0185, |
|
"reward": 0.5161371529102325, |
|
"reward_std": 0.24183812364935875, |
|
"rewards/improved_len_reward_dast": 0.5161371529102325, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 1826.3213806152344, |
|
"epoch": 0.6869734283862605, |
|
"grad_norm": 0.18926873912683365, |
|
"kl": 0.0092010498046875, |
|
"learning_rate": 3.417099247480277e-07, |
|
"loss": 0.0219, |
|
"reward": 0.440113328397274, |
|
"reward_std": 0.24182692915201187, |
|
"rewards/improved_len_reward_dast": 0.440113328397274, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 2168.744842529297, |
|
"epoch": 0.6895657809462087, |
|
"grad_norm": 0.18825197599044874, |
|
"kl": 0.0110931396484375, |
|
"learning_rate": 3.3809671695416916e-07, |
|
"loss": 0.0291, |
|
"reward": 0.5052645355463028, |
|
"reward_std": 0.3056667521595955, |
|
"rewards/improved_len_reward_dast": 0.5052645355463028, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1406.0509643554688, |
|
"epoch": 0.6921581335061568, |
|
"grad_norm": 0.1830588242176062, |
|
"kl": 0.00689697265625, |
|
"learning_rate": 3.345009787316859e-07, |
|
"loss": 0.0028, |
|
"reward": 0.5441867634654045, |
|
"reward_std": 0.21564403921365738, |
|
"rewards/improved_len_reward_dast": 0.5441867634654045, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1561.0254669189453, |
|
"epoch": 0.694750486066105, |
|
"grad_norm": 0.19864755916409904, |
|
"kl": 0.00759124755859375, |
|
"learning_rate": 3.309230065177289e-07, |
|
"loss": 0.0233, |
|
"reward": 0.6223798245191574, |
|
"reward_std": 0.22251487523317337, |
|
"rewards/improved_len_reward_dast": 0.6223798245191574, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1677.2652893066406, |
|
"epoch": 0.6973428386260532, |
|
"grad_norm": 0.18729962591317764, |
|
"kl": 0.009307861328125, |
|
"learning_rate": 3.273630952847971e-07, |
|
"loss": 0.0169, |
|
"reward": 0.5602849051356316, |
|
"reward_std": 0.20688385143876076, |
|
"rewards/improved_len_reward_dast": 0.5602849051356316, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1901.9795837402344, |
|
"epoch": 0.6999351911860013, |
|
"grad_norm": 0.1757643708818276, |
|
"kl": 0.00939178466796875, |
|
"learning_rate": 3.2382153851641996e-07, |
|
"loss": 0.0048, |
|
"reward": 0.4372241795063019, |
|
"reward_std": 0.1733334343880415, |
|
"rewards/improved_len_reward_dast": 0.4372241795063019, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1864.9489440917969, |
|
"epoch": 0.7025275437459495, |
|
"grad_norm": 0.20979118436624683, |
|
"kl": 0.011383056640625, |
|
"learning_rate": 3.202986281829616e-07, |
|
"loss": 0.0047, |
|
"reward": 0.49054908007383347, |
|
"reward_std": 0.2722769007086754, |
|
"rewards/improved_len_reward_dast": 0.49054908007383347, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1864.551025390625, |
|
"epoch": 0.7051198963058976, |
|
"grad_norm": 0.18787099752483769, |
|
"kl": 0.0099334716796875, |
|
"learning_rate": 3.1679465471755106e-07, |
|
"loss": 0.0112, |
|
"reward": 0.4509451389312744, |
|
"reward_std": 0.2106573022902012, |
|
"rewards/improved_len_reward_dast": 0.4509451389312744, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 2081.4693298339844, |
|
"epoch": 0.7077122488658457, |
|
"grad_norm": 0.17067985532424773, |
|
"kl": 0.013275146484375, |
|
"learning_rate": 3.1330990699213824e-07, |
|
"loss": 0.0178, |
|
"reward": 0.52352125197649, |
|
"reward_std": 0.17880443297326565, |
|
"rewards/improved_len_reward_dast": 0.52352125197649, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1941.5101928710938, |
|
"epoch": 0.7103046014257939, |
|
"grad_norm": 0.20384047669640917, |
|
"kl": 0.009765625, |
|
"learning_rate": 3.0984467229367885e-07, |
|
"loss": -0.0165, |
|
"reward": 0.47794508188962936, |
|
"reward_std": 0.16116551123559475, |
|
"rewards/improved_len_reward_dast": 0.47794508188962936, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1740.596923828125, |
|
"epoch": 0.712896953985742, |
|
"grad_norm": 0.16442269270533758, |
|
"kl": 0.0075531005859375, |
|
"learning_rate": 3.063992363004503e-07, |
|
"loss": 0.023, |
|
"reward": 0.6044076532125473, |
|
"reward_std": 0.24043289944529533, |
|
"rewards/improved_len_reward_dast": 0.6044076532125473, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1916.8571166992188, |
|
"epoch": 0.7154893065456902, |
|
"grad_norm": 0.20495543381215128, |
|
"kl": 0.00904083251953125, |
|
"learning_rate": 3.0297388305850004e-07, |
|
"loss": 0.017, |
|
"reward": 0.46368006244301796, |
|
"reward_std": 0.2539185471832752, |
|
"rewards/improved_len_reward_dast": 0.46368006244301796, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1812.142822265625, |
|
"epoch": 0.7180816591056384, |
|
"grad_norm": 0.25645445924847915, |
|
"kl": 0.010040283203125, |
|
"learning_rate": 2.9956889495822877e-07, |
|
"loss": 0.0104, |
|
"reward": 0.5476516783237457, |
|
"reward_std": 0.24734269082546234, |
|
"rewards/improved_len_reward_dast": 0.5476516783237457, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1896.540771484375, |
|
"epoch": 0.7206740116655865, |
|
"grad_norm": 0.1648074844235057, |
|
"kl": 0.0090789794921875, |
|
"learning_rate": 2.961845527111091e-07, |
|
"loss": 0.0101, |
|
"reward": 0.4374995678663254, |
|
"reward_std": 0.22751843184232712, |
|
"rewards/improved_len_reward_dast": 0.4374995678663254, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1794.9744567871094, |
|
"epoch": 0.7232663642255347, |
|
"grad_norm": 0.1965304858919259, |
|
"kl": 0.009490966796875, |
|
"learning_rate": 2.9282113532654363e-07, |
|
"loss": 0.0269, |
|
"reward": 0.6033914387226105, |
|
"reward_std": 0.21910444274544716, |
|
"rewards/improved_len_reward_dast": 0.6033914387226105, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1833.938720703125, |
|
"epoch": 0.7258587167854829, |
|
"grad_norm": 0.22688751850212643, |
|
"kl": 0.0119781494140625, |
|
"learning_rate": 2.894789200888634e-07, |
|
"loss": 0.0314, |
|
"reward": 0.6300860643386841, |
|
"reward_std": 0.20502058789134026, |
|
"rewards/improved_len_reward_dast": 0.6300860643386841, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1426.5050659179688, |
|
"epoch": 0.728451069345431, |
|
"grad_norm": 0.19933588720750745, |
|
"kl": 0.0085296630859375, |
|
"learning_rate": 2.8615818253446766e-07, |
|
"loss": 0.0176, |
|
"reward": 0.6437288224697113, |
|
"reward_std": 0.18815965950489044, |
|
"rewards/improved_len_reward_dast": 0.6437288224697113, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1340.8316345214844, |
|
"epoch": 0.7310434219053791, |
|
"grad_norm": 0.17972587406933935, |
|
"kl": 0.00726318359375, |
|
"learning_rate": 2.828591964291093e-07, |
|
"loss": 0.0208, |
|
"reward": 0.4648343026638031, |
|
"reward_std": 0.22194743156433105, |
|
"rewards/improved_len_reward_dast": 0.4648343026638031, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1601.1275329589844, |
|
"epoch": 0.7336357744653272, |
|
"grad_norm": 0.19127024584768593, |
|
"kl": 0.00716400146484375, |
|
"learning_rate": 2.7958223374532363e-07, |
|
"loss": 0.0235, |
|
"reward": 0.4880499690771103, |
|
"reward_std": 0.27051419019699097, |
|
"rewards/improved_len_reward_dast": 0.4880499690771103, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1403.9591674804688, |
|
"epoch": 0.7362281270252754, |
|
"grad_norm": 0.17860683081831877, |
|
"kl": 0.007843017578125, |
|
"learning_rate": 2.7632756464000835e-07, |
|
"loss": 0.0191, |
|
"reward": 0.6974282413721085, |
|
"reward_std": 0.17937561869621277, |
|
"rewards/improved_len_reward_dast": 0.6974282413721085, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 2051.2550354003906, |
|
"epoch": 0.7388204795852236, |
|
"grad_norm": 0.19636082014864284, |
|
"kl": 0.0125885009765625, |
|
"learning_rate": 2.730954574321503e-07, |
|
"loss": 0.0296, |
|
"reward": 0.3826203756034374, |
|
"reward_std": 0.2091355100274086, |
|
"rewards/improved_len_reward_dast": 0.3826203756034374, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1637.9949035644531, |
|
"epoch": 0.7414128321451717, |
|
"grad_norm": 0.19076788482154552, |
|
"kl": 0.00902557373046875, |
|
"learning_rate": 2.698861785807055e-07, |
|
"loss": 0.0357, |
|
"reward": 0.5993083268404007, |
|
"reward_std": 0.26309484988451004, |
|
"rewards/improved_len_reward_dast": 0.5993083268404007, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1831.4847106933594, |
|
"epoch": 0.7440051847051199, |
|
"grad_norm": 0.17423608191943502, |
|
"kl": 0.00811004638671875, |
|
"learning_rate": 2.6669999266263154e-07, |
|
"loss": -0.009, |
|
"reward": 0.4810323938727379, |
|
"reward_std": 0.2594267800450325, |
|
"rewards/improved_len_reward_dast": 0.4810323938727379, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 1873.801025390625, |
|
"epoch": 0.7465975372650681, |
|
"grad_norm": 0.16706780039675176, |
|
"kl": 0.0091552734375, |
|
"learning_rate": 2.635371623510758e-07, |
|
"loss": 0.013, |
|
"reward": 0.39794730208814144, |
|
"reward_std": 0.2275175377726555, |
|
"rewards/improved_len_reward_dast": 0.39794730208814144, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1484.2346649169922, |
|
"epoch": 0.7491898898250162, |
|
"grad_norm": 0.21586526660516042, |
|
"kl": 0.0080718994140625, |
|
"learning_rate": 2.6039794839372066e-07, |
|
"loss": -0.0156, |
|
"reward": 0.49782148748636246, |
|
"reward_std": 0.24559944868087769, |
|
"rewards/improved_len_reward_dast": 0.49782148748636246, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1958.8162536621094, |
|
"epoch": 0.7517822423849644, |
|
"grad_norm": 0.17918596004756887, |
|
"kl": 0.0090789794921875, |
|
"learning_rate": 2.5728260959128614e-07, |
|
"loss": 0.0274, |
|
"reward": 0.5516902059316635, |
|
"reward_std": 0.21921641565859318, |
|
"rewards/improved_len_reward_dast": 0.5516902059316635, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 2349.9642028808594, |
|
"epoch": 0.7543745949449125, |
|
"grad_norm": 0.17462481339673772, |
|
"kl": 0.01324462890625, |
|
"learning_rate": 2.541914027761951e-07, |
|
"loss": 0.038, |
|
"reward": 0.46060309559106827, |
|
"reward_std": 0.24067510664463043, |
|
"rewards/improved_len_reward_dast": 0.46060309559106827, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1819.5612182617188, |
|
"epoch": 0.7569669475048607, |
|
"grad_norm": 0.20515893392963483, |
|
"kl": 0.0117034912109375, |
|
"learning_rate": 2.511245827913991e-07, |
|
"loss": 0.0134, |
|
"reward": 0.5075127482414246, |
|
"reward_std": 0.2391039952635765, |
|
"rewards/improved_len_reward_dast": 0.5075127482414246, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1869.4029846191406, |
|
"epoch": 0.7595593000648088, |
|
"grad_norm": 0.18937190499433085, |
|
"kl": 0.00841522216796875, |
|
"learning_rate": 2.4808240246936866e-07, |
|
"loss": 0.0268, |
|
"reward": 0.42616455629467964, |
|
"reward_std": 0.248293437063694, |
|
"rewards/improved_len_reward_dast": 0.42616455629467964, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1934.89794921875, |
|
"epoch": 0.7621516526247569, |
|
"grad_norm": 0.21697154774685395, |
|
"kl": 0.0114898681640625, |
|
"learning_rate": 2.450651126112504e-07, |
|
"loss": 0.0579, |
|
"reward": 0.558953121304512, |
|
"reward_std": 0.23153522983193398, |
|
"rewards/improved_len_reward_dast": 0.558953121304512, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 1610.1019439697266, |
|
"epoch": 0.7647440051847051, |
|
"grad_norm": 0.2226928196281353, |
|
"kl": 0.00940704345703125, |
|
"learning_rate": 2.4207296196618924e-07, |
|
"loss": 0.0635, |
|
"reward": 0.5272797495126724, |
|
"reward_std": 0.18615226447582245, |
|
"rewards/improved_len_reward_dast": 0.5272797495126724, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 1119.1581420898438, |
|
"epoch": 0.7673363577446533, |
|
"grad_norm": 0.2027474565515649, |
|
"kl": 0.006317138671875, |
|
"learning_rate": 2.3910619721082253e-07, |
|
"loss": 0.0278, |
|
"reward": 0.48764973133802414, |
|
"reward_std": 0.2782805897295475, |
|
"rewards/improved_len_reward_dast": 0.48764973133802414, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1616.4081420898438, |
|
"epoch": 0.7699287103046014, |
|
"grad_norm": 0.17541198964340385, |
|
"kl": 0.0094146728515625, |
|
"learning_rate": 2.3616506292894282e-07, |
|
"loss": -0.0067, |
|
"reward": 0.5815595760941505, |
|
"reward_std": 0.25435012578964233, |
|
"rewards/improved_len_reward_dast": 0.5815595760941505, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1853.7652587890625, |
|
"epoch": 0.7725210628645496, |
|
"grad_norm": 0.16574059006766365, |
|
"kl": 0.0091705322265625, |
|
"learning_rate": 2.332498015913344e-07, |
|
"loss": -0.0098, |
|
"reward": 0.5380261987447739, |
|
"reward_std": 0.21175834722816944, |
|
"rewards/improved_len_reward_dast": 0.5380261987447739, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1436.3621978759766, |
|
"epoch": 0.7751134154244977, |
|
"grad_norm": 0.18476117099686806, |
|
"kl": 0.00868988037109375, |
|
"learning_rate": 2.303606535357843e-07, |
|
"loss": 0.0273, |
|
"reward": 0.6286723613739014, |
|
"reward_std": 0.21005361154675484, |
|
"rewards/improved_len_reward_dast": 0.6286723613739014, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1812.0917663574219, |
|
"epoch": 0.7777057679844459, |
|
"grad_norm": 0.22027403000618603, |
|
"kl": 0.00989532470703125, |
|
"learning_rate": 2.2749785694726685e-07, |
|
"loss": 0.0398, |
|
"reward": 0.5523130521178246, |
|
"reward_std": 0.2311643809080124, |
|
"rewards/improved_len_reward_dast": 0.5523130521178246, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1550.091796875, |
|
"epoch": 0.7802981205443941, |
|
"grad_norm": 0.16674263883125018, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 2.2466164783830972e-07, |
|
"loss": 0.0133, |
|
"reward": 0.5227341949939728, |
|
"reward_std": 0.2449796199798584, |
|
"rewards/improved_len_reward_dast": 0.5227341949939728, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1884.5560607910156, |
|
"epoch": 0.7828904731043422, |
|
"grad_norm": 0.18268480428784603, |
|
"kl": 0.00943756103515625, |
|
"learning_rate": 2.2185226002953483e-07, |
|
"loss": -0.0221, |
|
"reward": 0.5044264793395996, |
|
"reward_std": 0.2946867607533932, |
|
"rewards/improved_len_reward_dast": 0.5044264793395996, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 1944.2856750488281, |
|
"epoch": 0.7854828256642904, |
|
"grad_norm": 0.20936164270865662, |
|
"kl": 0.01123046875, |
|
"learning_rate": 2.1906992513038268e-07, |
|
"loss": 0.0225, |
|
"reward": 0.4654003605246544, |
|
"reward_std": 0.30387868732213974, |
|
"rewards/improved_len_reward_dast": 0.4654003605246544, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1758.4132385253906, |
|
"epoch": 0.7880751782242384, |
|
"grad_norm": 0.17935190379674407, |
|
"kl": 0.010589599609375, |
|
"learning_rate": 2.1631487252001822e-07, |
|
"loss": 0.0077, |
|
"reward": 0.5262851193547249, |
|
"reward_std": 0.2472703866660595, |
|
"rewards/improved_len_reward_dast": 0.5262851193547249, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 2335.3468627929688, |
|
"epoch": 0.7906675307841866, |
|
"grad_norm": 0.15643277467226446, |
|
"kl": 0.012451171875, |
|
"learning_rate": 2.1358732932842032e-07, |
|
"loss": 0.0207, |
|
"reward": 0.3303733505308628, |
|
"reward_std": 0.25162431970238686, |
|
"rewards/improved_len_reward_dast": 0.3303733505308628, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 1861.2805480957031, |
|
"epoch": 0.7932598833441348, |
|
"grad_norm": 0.1970163252686287, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 2.1088752041765734e-07, |
|
"loss": 0.0398, |
|
"reward": 0.5499422550201416, |
|
"reward_std": 0.2154020182788372, |
|
"rewards/improved_len_reward_dast": 0.5499422550201416, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1671.3672790527344, |
|
"epoch": 0.7958522359040829, |
|
"grad_norm": 0.19782131445031087, |
|
"kl": 0.00991058349609375, |
|
"learning_rate": 2.0821566836334847e-07, |
|
"loss": 0.0267, |
|
"reward": 0.5274748802185059, |
|
"reward_std": 0.22224271297454834, |
|
"rewards/improved_len_reward_dast": 0.5274748802185059, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1504.4183044433594, |
|
"epoch": 0.7984445884640311, |
|
"grad_norm": 0.19970611340783334, |
|
"kl": 0.00811767578125, |
|
"learning_rate": 2.0557199343631494e-07, |
|
"loss": 0.01, |
|
"reward": 0.4628491848707199, |
|
"reward_std": 0.26031310856342316, |
|
"rewards/improved_len_reward_dast": 0.4628491848707199, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 1926.744873046875, |
|
"epoch": 0.8010369410239793, |
|
"grad_norm": 0.18231345453995476, |
|
"kl": 0.011356353759765625, |
|
"learning_rate": 2.0295671358442033e-07, |
|
"loss": 0.0294, |
|
"reward": 0.4870244786143303, |
|
"reward_std": 0.27465204894542694, |
|
"rewards/improved_len_reward_dast": 0.4870244786143303, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1507.6785430908203, |
|
"epoch": 0.8036292935839274, |
|
"grad_norm": 0.19445651546193465, |
|
"kl": 0.008941650390625, |
|
"learning_rate": 2.0037004441460263e-07, |
|
"loss": 0.0194, |
|
"reward": 0.5780586749315262, |
|
"reward_std": 0.20703395083546638, |
|
"rewards/improved_len_reward_dast": 0.5780586749315262, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1608.9642639160156, |
|
"epoch": 0.8062216461438756, |
|
"grad_norm": 0.16750814215077678, |
|
"kl": 0.0088043212890625, |
|
"learning_rate": 1.9781219917509987e-07, |
|
"loss": 0.0281, |
|
"reward": 0.600249782204628, |
|
"reward_std": 0.19979360327124596, |
|
"rewards/improved_len_reward_dast": 0.600249782204628, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1835.9234619140625, |
|
"epoch": 0.8088139987038238, |
|
"grad_norm": 0.2213868690216617, |
|
"kl": 0.010528564453125, |
|
"learning_rate": 1.9528338873786882e-07, |
|
"loss": 0.0167, |
|
"reward": 0.459771279245615, |
|
"reward_std": 0.2465382032096386, |
|
"rewards/improved_len_reward_dast": 0.459771279245615, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1742.1785583496094, |
|
"epoch": 0.8114063512637719, |
|
"grad_norm": 0.1996790847530568, |
|
"kl": 0.0092620849609375, |
|
"learning_rate": 1.9278382158120116e-07, |
|
"loss": 0.028, |
|
"reward": 0.5697463825345039, |
|
"reward_std": 0.27643223106861115, |
|
"rewards/improved_len_reward_dast": 0.5697463825345039, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1522.6886901855469, |
|
"epoch": 0.81399870382372, |
|
"grad_norm": 0.17972132296325355, |
|
"kl": 0.006275177001953125, |
|
"learning_rate": 1.9031370377253574e-07, |
|
"loss": 0.0038, |
|
"reward": 0.6382875889539719, |
|
"reward_std": 0.2210763283073902, |
|
"rewards/improved_len_reward_dast": 0.6382875889539719, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 1819.9336242675781, |
|
"epoch": 0.8165910563836681, |
|
"grad_norm": 0.23940587104074545, |
|
"kl": 0.010528564453125, |
|
"learning_rate": 1.8787323895147052e-07, |
|
"loss": -0.0003, |
|
"reward": 0.3470681682229042, |
|
"reward_std": 0.29173849523067474, |
|
"rewards/improved_len_reward_dast": 0.3470681682229042, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1632.4795532226562, |
|
"epoch": 0.8191834089436163, |
|
"grad_norm": 0.1952690422984032, |
|
"kl": 0.0094451904296875, |
|
"learning_rate": 1.8546262831297438e-07, |
|
"loss": -0.007, |
|
"reward": 0.6234361678361893, |
|
"reward_std": 0.2323027402162552, |
|
"rewards/improved_len_reward_dast": 0.6234361678361893, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 1467.7703552246094, |
|
"epoch": 0.8217757615035645, |
|
"grad_norm": 0.18915172276823475, |
|
"kl": 0.0072021484375, |
|
"learning_rate": 1.8308207059079938e-07, |
|
"loss": -0.0214, |
|
"reward": 0.4193726107478142, |
|
"reward_std": 0.28542226925492287, |
|
"rewards/improved_len_reward_dast": 0.4193726107478142, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 1709.9183044433594, |
|
"epoch": 0.8243681140635126, |
|
"grad_norm": 0.18717550388451976, |
|
"kl": 0.0111846923828125, |
|
"learning_rate": 1.8073176204109837e-07, |
|
"loss": 0.0136, |
|
"reward": 0.6079925745725632, |
|
"reward_std": 0.1753884293138981, |
|
"rewards/improved_len_reward_dast": 0.6079925745725632, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1549.0101623535156, |
|
"epoch": 0.8269604666234608, |
|
"grad_norm": 0.21375644169275265, |
|
"kl": 0.00875091552734375, |
|
"learning_rate": 1.7841189642624428e-07, |
|
"loss": -0.0289, |
|
"reward": 0.48142021149396896, |
|
"reward_std": 0.28746388852596283, |
|
"rewards/improved_len_reward_dast": 0.48142021149396896, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 1784.19384765625, |
|
"epoch": 0.829552819183409, |
|
"grad_norm": 0.17176452506605447, |
|
"kl": 0.0102691650390625, |
|
"learning_rate": 1.7612266499885642e-07, |
|
"loss": 0.0089, |
|
"reward": 0.6086387038230896, |
|
"reward_std": 0.22643940150737762, |
|
"rewards/improved_len_reward_dast": 0.6086387038230896, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1210.2295684814453, |
|
"epoch": 0.8321451717433571, |
|
"grad_norm": 0.2124277117191842, |
|
"kl": 0.00737762451171875, |
|
"learning_rate": 1.7386425648603354e-07, |
|
"loss": 0.0397, |
|
"reward": 0.6170787662267685, |
|
"reward_std": 0.2108596581965685, |
|
"rewards/improved_len_reward_dast": 0.6170787662267685, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1488.0816040039062, |
|
"epoch": 0.8347375243033053, |
|
"grad_norm": 0.2120577617498992, |
|
"kl": 0.00836181640625, |
|
"learning_rate": 1.716368570737946e-07, |
|
"loss": 0.0409, |
|
"reward": 0.601994976401329, |
|
"reward_std": 0.2291371487081051, |
|
"rewards/improved_len_reward_dast": 0.601994976401329, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1990.8111419677734, |
|
"epoch": 0.8373298768632534, |
|
"grad_norm": 0.15844383624247524, |
|
"kl": 0.010498046875, |
|
"learning_rate": 1.6944065039173004e-07, |
|
"loss": -0.0043, |
|
"reward": 0.3935827948153019, |
|
"reward_std": 0.2862498462200165, |
|
"rewards/improved_len_reward_dast": 0.3935827948153019, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 2001.5918273925781, |
|
"epoch": 0.8399222294232016, |
|
"grad_norm": 0.16218776295200982, |
|
"kl": 0.011138916015625, |
|
"learning_rate": 1.672758174978622e-07, |
|
"loss": 0.0059, |
|
"reward": 0.4957594498991966, |
|
"reward_std": 0.22672280296683311, |
|
"rewards/improved_len_reward_dast": 0.4957594498991966, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 1417.3622131347656, |
|
"epoch": 0.8425145819831497, |
|
"grad_norm": 0.2087617301180929, |
|
"kl": 0.00780487060546875, |
|
"learning_rate": 1.6514253686371917e-07, |
|
"loss": 0.0289, |
|
"reward": 0.5871463492512703, |
|
"reward_std": 0.21862176433205605, |
|
"rewards/improved_len_reward_dast": 0.5871463492512703, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1697.7295837402344, |
|
"epoch": 0.8451069345430978, |
|
"grad_norm": 0.16375311514593216, |
|
"kl": 0.00887298583984375, |
|
"learning_rate": 1.630409843596216e-07, |
|
"loss": 0.0106, |
|
"reward": 0.5537277311086655, |
|
"reward_std": 0.22276470810174942, |
|
"rewards/improved_len_reward_dast": 0.5537277311086655, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1679.7193298339844, |
|
"epoch": 0.847699287103046, |
|
"grad_norm": 0.17301892622927403, |
|
"kl": 0.0079193115234375, |
|
"learning_rate": 1.609713332401831e-07, |
|
"loss": 0.0271, |
|
"reward": 0.5200591683387756, |
|
"reward_std": 0.24121804535388947, |
|
"rewards/improved_len_reward_dast": 0.5200591683387756, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1716.8316345214844, |
|
"epoch": 0.8502916396629941, |
|
"grad_norm": 0.21625142796960517, |
|
"kl": 0.007904052734375, |
|
"learning_rate": 1.5893375413002765e-07, |
|
"loss": -0.0081, |
|
"reward": 0.3760797679424286, |
|
"reward_std": 0.3030256852507591, |
|
"rewards/improved_len_reward_dast": 0.3760797679424286, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 2087.734649658203, |
|
"epoch": 0.8528839922229423, |
|
"grad_norm": 0.16651549154328052, |
|
"kl": 0.0118865966796875, |
|
"learning_rate": 1.569284150097226e-07, |
|
"loss": 0.0193, |
|
"reward": 0.526580810546875, |
|
"reward_std": 0.20104971155524254, |
|
"rewards/improved_len_reward_dast": 0.526580810546875, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 1818.8520202636719, |
|
"epoch": 0.8554763447828905, |
|
"grad_norm": 0.18294456751416227, |
|
"kl": 0.0110626220703125, |
|
"learning_rate": 1.5495548120193003e-07, |
|
"loss": -0.0005, |
|
"reward": 0.6227145195007324, |
|
"reward_std": 0.2254919856786728, |
|
"rewards/improved_len_reward_dast": 0.6227145195007324, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1455.596939086914, |
|
"epoch": 0.8580686973428386, |
|
"grad_norm": 0.18443385066386403, |
|
"kl": 0.006561279296875, |
|
"learning_rate": 1.5301511535777784e-07, |
|
"loss": 0.0069, |
|
"reward": 0.6691017746925354, |
|
"reward_std": 0.2389560304582119, |
|
"rewards/improved_len_reward_dast": 0.6691017746925354, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1708.9540710449219, |
|
"epoch": 0.8606610499027868, |
|
"grad_norm": 0.22349331481770487, |
|
"kl": 0.01061248779296875, |
|
"learning_rate": 1.5110747744345006e-07, |
|
"loss": 0.0166, |
|
"reward": 0.5234609097242355, |
|
"reward_std": 0.22545504197478294, |
|
"rewards/improved_len_reward_dast": 0.5234609097242355, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1791.2856750488281, |
|
"epoch": 0.863253402462735, |
|
"grad_norm": 0.20595620886423197, |
|
"kl": 0.0110321044921875, |
|
"learning_rate": 1.4923272472699986e-07, |
|
"loss": 0.0132, |
|
"reward": 0.4878820851445198, |
|
"reward_std": 0.19468558579683304, |
|
"rewards/improved_len_reward_dast": 0.4878820851445198, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1111.2040557861328, |
|
"epoch": 0.8658457550226831, |
|
"grad_norm": 0.18776891622154324, |
|
"kl": 0.00612640380859375, |
|
"learning_rate": 1.4739101176538274e-07, |
|
"loss": 0.0092, |
|
"reward": 0.4363629147410393, |
|
"reward_std": 0.23635073751211166, |
|
"rewards/improved_len_reward_dast": 0.4363629147410393, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1884.7958984375, |
|
"epoch": 0.8684381075826313, |
|
"grad_norm": 0.2336826524549116, |
|
"kl": 0.0115203857421875, |
|
"learning_rate": 1.4558249039171639e-07, |
|
"loss": 0.0514, |
|
"reward": 0.5176705569028854, |
|
"reward_std": 0.24876829609274864, |
|
"rewards/improved_len_reward_dast": 0.5176705569028854, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 2040.5509643554688, |
|
"epoch": 0.8710304601425793, |
|
"grad_norm": 0.218416613251664, |
|
"kl": 0.0112457275390625, |
|
"learning_rate": 1.4380730970276195e-07, |
|
"loss": 0.0374, |
|
"reward": 0.4303254596889019, |
|
"reward_std": 0.22433782927691936, |
|
"rewards/improved_len_reward_dast": 0.4303254596889019, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 2323.64794921875, |
|
"epoch": 0.8736228127025275, |
|
"grad_norm": 0.21230824911936896, |
|
"kl": 0.015228271484375, |
|
"learning_rate": 1.420656160466333e-07, |
|
"loss": 0.0156, |
|
"reward": 0.37112269178032875, |
|
"reward_std": 0.22541575506329536, |
|
"rewards/improved_len_reward_dast": 0.37112269178032875, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 1894.1683044433594, |
|
"epoch": 0.8762151652624757, |
|
"grad_norm": 0.229570039046522, |
|
"kl": 0.0124053955078125, |
|
"learning_rate": 1.4035755301073102e-07, |
|
"loss": 0.0045, |
|
"reward": 0.5252515897154808, |
|
"reward_std": 0.2364770919084549, |
|
"rewards/improved_len_reward_dast": 0.5252515897154808, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 1622.8418273925781, |
|
"epoch": 0.8788075178224238, |
|
"grad_norm": 0.1958204168185744, |
|
"kl": 0.01035308837890625, |
|
"learning_rate": 1.386832614099056e-07, |
|
"loss": 0.0081, |
|
"reward": 0.5822550505399704, |
|
"reward_std": 0.21591638028621674, |
|
"rewards/improved_len_reward_dast": 0.5822550505399704, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 1524.19384765625, |
|
"epoch": 0.881399870382372, |
|
"grad_norm": 0.18334226293404582, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 1.3704287927484846e-07, |
|
"loss": 0.0124, |
|
"reward": 0.45736076682806015, |
|
"reward_std": 0.26574842631816864, |
|
"rewards/improved_len_reward_dast": 0.45736076682806015, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 1764.8775329589844, |
|
"epoch": 0.8839922229423202, |
|
"grad_norm": 0.1983658544116603, |
|
"kl": 0.01029205322265625, |
|
"learning_rate": 1.3543654184071186e-07, |
|
"loss": -0.0056, |
|
"reward": 0.5266754031181335, |
|
"reward_std": 0.19769595563411713, |
|
"rewards/improved_len_reward_dast": 0.5266754031181335, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 1573.7908172607422, |
|
"epoch": 0.8865845755022683, |
|
"grad_norm": 0.20470905009725737, |
|
"kl": 0.00878143310546875, |
|
"learning_rate": 1.3386438153596067e-07, |
|
"loss": 0.0079, |
|
"reward": 0.45632604509592056, |
|
"reward_std": 0.27837061509490013, |
|
"rewards/improved_len_reward_dast": 0.45632604509592056, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 1546.9642639160156, |
|
"epoch": 0.8891769280622165, |
|
"grad_norm": 0.195646608080127, |
|
"kl": 0.00881195068359375, |
|
"learning_rate": 1.323265279714543e-07, |
|
"loss": -0.0159, |
|
"reward": 0.47241977229714394, |
|
"reward_std": 0.20328497141599655, |
|
"rewards/improved_len_reward_dast": 0.47241977229714394, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 1638.3571166992188, |
|
"epoch": 0.8917692806221647, |
|
"grad_norm": 0.19710744576953143, |
|
"kl": 0.0110015869140625, |
|
"learning_rate": 1.3082310792976202e-07, |
|
"loss": 0.0262, |
|
"reward": 0.5500081032514572, |
|
"reward_std": 0.2118955608457327, |
|
"rewards/improved_len_reward_dast": 0.5500081032514572, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 1797.3570861816406, |
|
"epoch": 0.8943616331821128, |
|
"grad_norm": 0.23174034375477884, |
|
"kl": 0.01033782958984375, |
|
"learning_rate": 1.293542453547102e-07, |
|
"loss": 0.0376, |
|
"reward": 0.5329776927828789, |
|
"reward_std": 0.26193203777074814, |
|
"rewards/improved_len_reward_dast": 0.5329776927828789, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 1538.1580810546875, |
|
"epoch": 0.8969539857420609, |
|
"grad_norm": 0.1904135528888599, |
|
"kl": 0.00870513916015625, |
|
"learning_rate": 1.279200613411642e-07, |
|
"loss": 0.0264, |
|
"reward": 0.5394655913114548, |
|
"reward_std": 0.24537776410579681, |
|
"rewards/improved_len_reward_dast": 0.5394655913114548, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 2166.448944091797, |
|
"epoch": 0.899546338302009, |
|
"grad_norm": 0.1557827805565501, |
|
"kl": 0.0125579833984375, |
|
"learning_rate": 1.2652067412504605e-07, |
|
"loss": 0.0081, |
|
"reward": 0.4106425456702709, |
|
"reward_std": 0.30277248471975327, |
|
"rewards/improved_len_reward_dast": 0.4106425456702709, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 1541.7806091308594, |
|
"epoch": 0.9021386908619572, |
|
"grad_norm": 0.19932118981267394, |
|
"kl": 0.0081939697265625, |
|
"learning_rate": 1.251561990735859e-07, |
|
"loss": 0.0277, |
|
"reward": 0.5063697546720505, |
|
"reward_std": 0.24998274445533752, |
|
"rewards/improved_len_reward_dast": 0.5063697546720505, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 2111.5306091308594, |
|
"epoch": 0.9047310434219054, |
|
"grad_norm": 0.2417651421099663, |
|
"kl": 0.0139312744140625, |
|
"learning_rate": 1.238267486758117e-07, |
|
"loss": -0.0203, |
|
"reward": 0.27164783608168364, |
|
"reward_std": 0.22704457119107246, |
|
"rewards/improved_len_reward_dast": 0.27164783608168364, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 1849.9336242675781, |
|
"epoch": 0.9073233959818535, |
|
"grad_norm": 0.21790965647021238, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 1.2253243253327504e-07, |
|
"loss": 0.0126, |
|
"reward": 0.4877898320555687, |
|
"reward_std": 0.2690836489200592, |
|
"rewards/improved_len_reward_dast": 0.4877898320555687, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 1613.3826293945312, |
|
"epoch": 0.9099157485418017, |
|
"grad_norm": 0.20972635962388697, |
|
"kl": 0.009246826171875, |
|
"learning_rate": 1.212733573510154e-07, |
|
"loss": 0.026, |
|
"reward": 0.5494348630309105, |
|
"reward_std": 0.2651122659444809, |
|
"rewards/improved_len_reward_dast": 0.5494348630309105, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 1592.3264770507812, |
|
"epoch": 0.9125081011017498, |
|
"grad_norm": 0.21422445239057133, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 1.20049626928764e-07, |
|
"loss": 0.0288, |
|
"reward": 0.5722446367144585, |
|
"reward_std": 0.21782485768198967, |
|
"rewards/improved_len_reward_dast": 0.5722446367144585, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 1722.6377258300781, |
|
"epoch": 0.915100453661698, |
|
"grad_norm": 0.19724359277224385, |
|
"kl": 0.0107574462890625, |
|
"learning_rate": 1.1886134215238539e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6072470247745514, |
|
"reward_std": 0.18419499322772026, |
|
"rewards/improved_len_reward_dast": 0.6072470247745514, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 1321.903060913086, |
|
"epoch": 0.9176928062216462, |
|
"grad_norm": 0.18827903780162886, |
|
"kl": 0.00800323486328125, |
|
"learning_rate": 1.1770860098556122e-07, |
|
"loss": 0.0036, |
|
"reward": 0.6655057221651077, |
|
"reward_std": 0.23036686331033707, |
|
"rewards/improved_len_reward_dast": 0.6655057221651077, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 1697.489730834961, |
|
"epoch": 0.9202851587815943, |
|
"grad_norm": 0.18446628505764348, |
|
"kl": 0.011688232421875, |
|
"learning_rate": 1.1659149846171314e-07, |
|
"loss": -0.0011, |
|
"reward": 0.6077793091535568, |
|
"reward_std": 0.24356402084231377, |
|
"rewards/improved_len_reward_dast": 0.6077793091535568, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 1707.193862915039, |
|
"epoch": 0.9228775113415425, |
|
"grad_norm": 0.21847824906310198, |
|
"kl": 0.0106964111328125, |
|
"learning_rate": 1.1551012667616889e-07, |
|
"loss": 0.0092, |
|
"reward": 0.5661942809820175, |
|
"reward_std": 0.20257538184523582, |
|
"rewards/improved_len_reward_dast": 0.5661942809820175, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 2016.994873046875, |
|
"epoch": 0.9254698639014906, |
|
"grad_norm": 0.18270155740337932, |
|
"kl": 0.0126800537109375, |
|
"learning_rate": 1.1446457477856933e-07, |
|
"loss": 0.0274, |
|
"reward": 0.4170667566359043, |
|
"reward_std": 0.2266511246562004, |
|
"rewards/improved_len_reward_dast": 0.4170667566359043, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 1594.9795532226562, |
|
"epoch": 0.9280622164614387, |
|
"grad_norm": 0.22722827158120512, |
|
"kl": 0.0077362060546875, |
|
"learning_rate": 1.1345492896551908e-07, |
|
"loss": 0.0393, |
|
"reward": 0.553664393723011, |
|
"reward_std": 0.31060051172971725, |
|
"rewards/improved_len_reward_dast": 0.553664393723011, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 1834.0458679199219, |
|
"epoch": 0.9306545690213869, |
|
"grad_norm": 0.17537356402412543, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 1.1248127247348025e-07, |
|
"loss": 0.0211, |
|
"reward": 0.5899290814995766, |
|
"reward_std": 0.23436808586120605, |
|
"rewards/improved_len_reward_dast": 0.5899290814995766, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 1781.6019744873047, |
|
"epoch": 0.933246921581335, |
|
"grad_norm": 0.18436444068948415, |
|
"kl": 0.01174163818359375, |
|
"learning_rate": 1.1154368557191032e-07, |
|
"loss": 0.012, |
|
"reward": 0.3762673009186983, |
|
"reward_std": 0.2291586957871914, |
|
"rewards/improved_len_reward_dast": 0.3762673009186983, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 1417.540771484375, |
|
"epoch": 0.9358392741412832, |
|
"grad_norm": 0.19623835654730662, |
|
"kl": 0.0077362060546875, |
|
"learning_rate": 1.1064224555664489e-07, |
|
"loss": 0.0005, |
|
"reward": 0.4894239827990532, |
|
"reward_std": 0.2442505694925785, |
|
"rewards/improved_len_reward_dast": 0.4894239827990532, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 1666.7703552246094, |
|
"epoch": 0.9384316267012314, |
|
"grad_norm": 0.17894551807778217, |
|
"kl": 0.00939178466796875, |
|
"learning_rate": 1.0977702674352485e-07, |
|
"loss": 0.0265, |
|
"reward": 0.607224777340889, |
|
"reward_std": 0.18787994422018528, |
|
"rewards/improved_len_reward_dast": 0.607224777340889, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 1492.4081420898438, |
|
"epoch": 0.9410239792611795, |
|
"grad_norm": 0.2507376552650098, |
|
"kl": 0.00975799560546875, |
|
"learning_rate": 1.0894810046227007e-07, |
|
"loss": 0.0023, |
|
"reward": 0.5297152251005173, |
|
"reward_std": 0.23747341334819794, |
|
"rewards/improved_len_reward_dast": 0.5297152251005173, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 1323.10205078125, |
|
"epoch": 0.9436163318211277, |
|
"grad_norm": 0.16318792733873802, |
|
"kl": 0.006740570068359375, |
|
"learning_rate": 1.0815553505059864e-07, |
|
"loss": -0.0032, |
|
"reward": 0.6097646132111549, |
|
"reward_std": 0.1873321644961834, |
|
"rewards/improved_len_reward_dast": 0.6097646132111549, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 1844.9795532226562, |
|
"epoch": 0.9462086843810759, |
|
"grad_norm": 0.2063060093669664, |
|
"kl": 0.0108489990234375, |
|
"learning_rate": 1.0739939584859327e-07, |
|
"loss": 0.0645, |
|
"reward": 0.5058535486459732, |
|
"reward_std": 0.22045359015464783, |
|
"rewards/improved_len_reward_dast": 0.5058535486459732, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 2252.352020263672, |
|
"epoch": 0.948801036941024, |
|
"grad_norm": 0.20816769237505958, |
|
"kl": 0.0121002197265625, |
|
"learning_rate": 1.066797451933144e-07, |
|
"loss": 0.0024, |
|
"reward": 0.340947512537241, |
|
"reward_std": 0.3400820717215538, |
|
"rewards/improved_len_reward_dast": 0.340947512537241, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 1632.3060607910156, |
|
"epoch": 0.9513933895009722, |
|
"grad_norm": 0.18459167765340137, |
|
"kl": 0.009857177734375, |
|
"learning_rate": 1.0599664241366108e-07, |
|
"loss": 0.0108, |
|
"reward": 0.5263752043247223, |
|
"reward_std": 0.2795609086751938, |
|
"rewards/improved_len_reward_dast": 0.5263752043247223, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 1941.2754516601562, |
|
"epoch": 0.9539857420609202, |
|
"grad_norm": 0.19606646732184554, |
|
"kl": 0.0090179443359375, |
|
"learning_rate": 1.0535014382547976e-07, |
|
"loss": 0.0404, |
|
"reward": 0.4571000598371029, |
|
"reward_std": 0.33078011497855186, |
|
"rewards/improved_len_reward_dast": 0.4571000598371029, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 1763.602035522461, |
|
"epoch": 0.9565780946208684, |
|
"grad_norm": 0.16415935179120342, |
|
"kl": 0.010005950927734375, |
|
"learning_rate": 1.0474030272692176e-07, |
|
"loss": 0.0194, |
|
"reward": 0.46546463668346405, |
|
"reward_std": 0.2402571141719818, |
|
"rewards/improved_len_reward_dast": 0.46546463668346405, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 1787.3367309570312, |
|
"epoch": 0.9591704471808166, |
|
"grad_norm": 0.18183476066972562, |
|
"kl": 0.0098114013671875, |
|
"learning_rate": 1.0416716939404906e-07, |
|
"loss": 0.0247, |
|
"reward": 0.5943343639373779, |
|
"reward_std": 0.25974351167678833, |
|
"rewards/improved_len_reward_dast": 0.5943343639373779, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 2110.2244567871094, |
|
"epoch": 0.9617627997407647, |
|
"grad_norm": 0.1753149156601552, |
|
"kl": 0.0126953125, |
|
"learning_rate": 1.0363079107668965e-07, |
|
"loss": 0.028, |
|
"reward": 0.42196690291166306, |
|
"reward_std": 0.3020992539823055, |
|
"rewards/improved_len_reward_dast": 0.42196690291166306, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 1868.1173095703125, |
|
"epoch": 0.9643551523007129, |
|
"grad_norm": 0.188995256537594, |
|
"kl": 0.0104827880859375, |
|
"learning_rate": 1.03131211994542e-07, |
|
"loss": -0.0159, |
|
"reward": 0.34801803156733513, |
|
"reward_std": 0.30042145401239395, |
|
"rewards/improved_len_reward_dast": 0.34801803156733513, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 1634.489761352539, |
|
"epoch": 0.9669475048606611, |
|
"grad_norm": 0.16806202813706952, |
|
"kl": 0.0088043212890625, |
|
"learning_rate": 1.0266847333352986e-07, |
|
"loss": 0.0054, |
|
"reward": 0.4557268023490906, |
|
"reward_std": 0.24389904364943504, |
|
"rewards/improved_len_reward_dast": 0.4557268023490906, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 1552.938720703125, |
|
"epoch": 0.9695398574206092, |
|
"grad_norm": 0.18394008200083298, |
|
"kl": 0.0112762451171875, |
|
"learning_rate": 1.022426132424064e-07, |
|
"loss": 0.0133, |
|
"reward": 0.47902625799179077, |
|
"reward_std": 0.2295891009271145, |
|
"rewards/improved_len_reward_dast": 0.47902625799179077, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 1685.0968933105469, |
|
"epoch": 0.9721322099805574, |
|
"grad_norm": 0.1713793991991847, |
|
"kl": 0.00827789306640625, |
|
"learning_rate": 1.0185366682960968e-07, |
|
"loss": 0.0309, |
|
"reward": 0.5218155384063721, |
|
"reward_std": 0.23021429032087326, |
|
"rewards/improved_len_reward_dast": 0.5218155384063721, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 1745.448974609375, |
|
"epoch": 0.9747245625405055, |
|
"grad_norm": 0.17042694708491835, |
|
"kl": 0.01142120361328125, |
|
"learning_rate": 1.015016661603677e-07, |
|
"loss": 0.005, |
|
"reward": 0.4762755334377289, |
|
"reward_std": 0.17237477749586105, |
|
"rewards/improved_len_reward_dast": 0.4762755334377289, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 1860.1581420898438, |
|
"epoch": 0.9773169151004537, |
|
"grad_norm": 0.196878907070725, |
|
"kl": 0.01052093505859375, |
|
"learning_rate": 1.011866402540555e-07, |
|
"loss": 0.0255, |
|
"reward": 0.4570060186088085, |
|
"reward_std": 0.2744992598891258, |
|
"rewards/improved_len_reward_dast": 0.4570060186088085, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 1672.1224365234375, |
|
"epoch": 0.9799092676604018, |
|
"grad_norm": 0.18295738763893107, |
|
"kl": 0.00994110107421875, |
|
"learning_rate": 1.0090861508180229e-07, |
|
"loss": 0.0278, |
|
"reward": 0.5498954951763153, |
|
"reward_std": 0.205208458006382, |
|
"rewards/improved_len_reward_dast": 0.5498954951763153, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 1366.7295379638672, |
|
"epoch": 0.9825016202203499, |
|
"grad_norm": 0.19995975950807063, |
|
"kl": 0.00894927978515625, |
|
"learning_rate": 1.006676135643506e-07, |
|
"loss": 0.0182, |
|
"reward": 0.6602616906166077, |
|
"reward_std": 0.18208089470863342, |
|
"rewards/improved_len_reward_dast": 0.6602616906166077, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 1685.9489440917969, |
|
"epoch": 0.9850939727802981, |
|
"grad_norm": 0.2671604672472017, |
|
"kl": 0.0095367431640625, |
|
"learning_rate": 1.004636555701666e-07, |
|
"loss": 0.0317, |
|
"reward": 0.5187881141901016, |
|
"reward_std": 0.23186353966593742, |
|
"rewards/improved_len_reward_dast": 0.5187881141901016, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 2225.1122436523438, |
|
"epoch": 0.9876863253402463, |
|
"grad_norm": 0.20851974707971935, |
|
"kl": 0.0135955810546875, |
|
"learning_rate": 1.0029675791380211e-07, |
|
"loss": 0.0393, |
|
"reward": 0.5472966581583023, |
|
"reward_std": 0.22922645136713982, |
|
"rewards/improved_len_reward_dast": 0.5472966581583023, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 1713.9336242675781, |
|
"epoch": 0.9902786779001944, |
|
"grad_norm": 0.19490938929994006, |
|
"kl": 0.0126800537109375, |
|
"learning_rate": 1.0016693435450846e-07, |
|
"loss": 0.0017, |
|
"reward": 0.392940990626812, |
|
"reward_std": 0.2464723214507103, |
|
"rewards/improved_len_reward_dast": 0.392940990626812, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 1858.6632385253906, |
|
"epoch": 0.9928710304601426, |
|
"grad_norm": 0.1901299895408558, |
|
"kl": 0.00988006591796875, |
|
"learning_rate": 1.00074195595102e-07, |
|
"loss": 0.0158, |
|
"reward": 0.4689394012093544, |
|
"reward_std": 0.21242598444223404, |
|
"rewards/improved_len_reward_dast": 0.4689394012093544, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 1422.4081573486328, |
|
"epoch": 0.9954633830200907, |
|
"grad_norm": 0.24495186709668992, |
|
"kl": 0.009395599365234375, |
|
"learning_rate": 1.0001854928108199e-07, |
|
"loss": 0.0507, |
|
"reward": 0.5447990372776985, |
|
"reward_std": 0.21043004095554352, |
|
"rewards/improved_len_reward_dast": 0.5447990372776985, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 1555.7091674804688, |
|
"epoch": 0.9980557355800389, |
|
"grad_norm": 0.21071152676099067, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0156, |
|
"reward": 0.4512558877468109, |
|
"reward_std": 0.2603309191763401, |
|
"rewards/improved_len_reward_dast": 0.4512558877468109, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9980557355800389, |
|
"step": 385, |
|
"total_flos": 0.0, |
|
"train_loss": 0.01745116442015588, |
|
"train_runtime": 68533.3834, |
|
"train_samples_per_second": 0.158, |
|
"train_steps_per_second": 0.006 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 385, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 7, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|