|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9980557355800389, |
|
"eval_steps": 500, |
|
"global_step": 385, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1870.551025390625, |
|
"epoch": 0.002592352559948153, |
|
"grad_norm": 0.13448497653007507, |
|
"kl": 0.0, |
|
"learning_rate": 2.564102564102564e-08, |
|
"loss": 0.022, |
|
"reward": 0.5185521692037582, |
|
"reward_std": 0.3244118466973305, |
|
"rewards/improved_len_reward_dast": 0.5185521692037582, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2074.5535583496094, |
|
"epoch": 0.005184705119896306, |
|
"grad_norm": 0.11098560690879822, |
|
"kl": 0.0, |
|
"learning_rate": 5.128205128205128e-08, |
|
"loss": 0.0251, |
|
"reward": 0.32963134348392487, |
|
"reward_std": 0.28946176916360855, |
|
"rewards/improved_len_reward_dast": 0.32963134348392487, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 1990.8341674804688, |
|
"epoch": 0.007777057679844459, |
|
"grad_norm": 0.1158740445971489, |
|
"kl": 0.00014102458953857422, |
|
"learning_rate": 7.692307692307692e-08, |
|
"loss": -0.0241, |
|
"reward": 0.26494530215859413, |
|
"reward_std": 0.3063320405781269, |
|
"rewards/improved_len_reward_dast": 0.26494530215859413, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2150.165802001953, |
|
"epoch": 0.010369410239792612, |
|
"grad_norm": 0.10702688992023468, |
|
"kl": 0.00012409687042236328, |
|
"learning_rate": 1.0256410256410256e-07, |
|
"loss": -0.0137, |
|
"reward": 0.456451453268528, |
|
"reward_std": 0.3154432289302349, |
|
"rewards/improved_len_reward_dast": 0.456451453268528, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 1848.1632385253906, |
|
"epoch": 0.012961762799740765, |
|
"grad_norm": 0.12357146292924881, |
|
"kl": 0.0001302957534790039, |
|
"learning_rate": 1.2820512820512818e-07, |
|
"loss": 0.028, |
|
"reward": 0.49856673181056976, |
|
"reward_std": 0.2522367388010025, |
|
"rewards/improved_len_reward_dast": 0.49856673181056976, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 2126.336700439453, |
|
"epoch": 0.015554115359688918, |
|
"grad_norm": 0.13506007194519043, |
|
"kl": 0.00012934207916259766, |
|
"learning_rate": 1.5384615384615385e-07, |
|
"loss": 0.0393, |
|
"reward": 0.4235878065228462, |
|
"reward_std": 0.25951137393713, |
|
"rewards/improved_len_reward_dast": 0.4235878065228462, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1832.3622131347656, |
|
"epoch": 0.01814646791963707, |
|
"grad_norm": 0.1261880099773407, |
|
"kl": 0.00011241436004638672, |
|
"learning_rate": 1.7948717948717948e-07, |
|
"loss": 0.0262, |
|
"reward": 0.3710284195840359, |
|
"reward_std": 0.2790074981749058, |
|
"rewards/improved_len_reward_dast": 0.3710284195840359, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 1735.2193603515625, |
|
"epoch": 0.020738820479585224, |
|
"grad_norm": 0.12955217063426971, |
|
"kl": 0.00010597705841064453, |
|
"learning_rate": 2.0512820512820512e-07, |
|
"loss": 0.0141, |
|
"reward": 0.4706665948033333, |
|
"reward_std": 0.2832951880991459, |
|
"rewards/improved_len_reward_dast": 0.4706665948033333, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 2113.0381774902344, |
|
"epoch": 0.023331173039533377, |
|
"grad_norm": 0.12865294516086578, |
|
"kl": 0.0001271963119506836, |
|
"learning_rate": 2.3076923076923078e-07, |
|
"loss": 0.0338, |
|
"reward": 0.35827554017305374, |
|
"reward_std": 0.29027409106492996, |
|
"rewards/improved_len_reward_dast": 0.35827554017305374, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1956.9030456542969, |
|
"epoch": 0.02592352559948153, |
|
"grad_norm": 0.13909928500652313, |
|
"kl": 0.00012624263763427734, |
|
"learning_rate": 2.5641025641025636e-07, |
|
"loss": 0.0336, |
|
"reward": 0.3675283007323742, |
|
"reward_std": 0.2691008448600769, |
|
"rewards/improved_len_reward_dast": 0.3675283007323742, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2262.3163146972656, |
|
"epoch": 0.028515878159429683, |
|
"grad_norm": 0.12856782972812653, |
|
"kl": 0.0001437664031982422, |
|
"learning_rate": 2.8205128205128203e-07, |
|
"loss": 0.017, |
|
"reward": 0.31318413466215134, |
|
"reward_std": 0.3111809715628624, |
|
"rewards/improved_len_reward_dast": 0.31318413466215134, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 2018.7933349609375, |
|
"epoch": 0.031108230719377836, |
|
"grad_norm": 0.12054255604743958, |
|
"kl": 0.0001354217529296875, |
|
"learning_rate": 3.076923076923077e-07, |
|
"loss": -0.0047, |
|
"reward": 0.3627483192831278, |
|
"reward_std": 0.3026025593280792, |
|
"rewards/improved_len_reward_dast": 0.3627483192831278, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1993.1734008789062, |
|
"epoch": 0.033700583279325985, |
|
"grad_norm": 0.13207760453224182, |
|
"kl": 0.00013363361358642578, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.01, |
|
"reward": 0.371895145624876, |
|
"reward_std": 0.26758549362421036, |
|
"rewards/improved_len_reward_dast": 0.371895145624876, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 2101.9234313964844, |
|
"epoch": 0.03629293583927414, |
|
"grad_norm": 0.13171768188476562, |
|
"kl": 0.00011819601058959961, |
|
"learning_rate": 3.5897435897435896e-07, |
|
"loss": 0.0455, |
|
"reward": 0.39824650436639786, |
|
"reward_std": 0.23699114099144936, |
|
"rewards/improved_len_reward_dast": 0.39824650436639786, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1458.2933349609375, |
|
"epoch": 0.03888528839922229, |
|
"grad_norm": 0.15543967485427856, |
|
"kl": 7.2479248046875e-05, |
|
"learning_rate": 3.8461538461538463e-07, |
|
"loss": 0.0843, |
|
"reward": 0.4232407733798027, |
|
"reward_std": 0.2356618531048298, |
|
"rewards/improved_len_reward_dast": 0.4232407733798027, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1313.4540405273438, |
|
"epoch": 0.04147764095917045, |
|
"grad_norm": 0.1376647651195526, |
|
"kl": 9.846687316894531e-05, |
|
"learning_rate": 4.1025641025641024e-07, |
|
"loss": -0.0437, |
|
"reward": 0.4939410910010338, |
|
"reward_std": 0.3177715875208378, |
|
"rewards/improved_len_reward_dast": 0.4939410910010338, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 2019.0994873046875, |
|
"epoch": 0.0440699935191186, |
|
"grad_norm": 0.11130757629871368, |
|
"kl": 0.00011444091796875, |
|
"learning_rate": 4.358974358974359e-07, |
|
"loss": 0.0027, |
|
"reward": 0.4330388903617859, |
|
"reward_std": 0.37679746001958847, |
|
"rewards/improved_len_reward_dast": 0.4330388903617859, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1429.14794921875, |
|
"epoch": 0.046662346079066754, |
|
"grad_norm": 0.1924666464328766, |
|
"kl": 8.881092071533203e-05, |
|
"learning_rate": 4.6153846153846156e-07, |
|
"loss": 0.0804, |
|
"reward": 0.30921216681599617, |
|
"reward_std": 0.2913207747042179, |
|
"rewards/improved_len_reward_dast": 0.30921216681599617, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 1738.7474060058594, |
|
"epoch": 0.0492546986390149, |
|
"grad_norm": 0.13304243981838226, |
|
"kl": 0.0001150369644165039, |
|
"learning_rate": 4.871794871794871e-07, |
|
"loss": 0.0111, |
|
"reward": 0.38398153707385063, |
|
"reward_std": 0.34359200298786163, |
|
"rewards/improved_len_reward_dast": 0.38398153707385063, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1761.8928527832031, |
|
"epoch": 0.05184705119896306, |
|
"grad_norm": 0.13784411549568176, |
|
"kl": 0.00011897087097167969, |
|
"learning_rate": 5.128205128205127e-07, |
|
"loss": -0.0009, |
|
"reward": 0.31885702908039093, |
|
"reward_std": 0.3634636849164963, |
|
"rewards/improved_len_reward_dast": 0.31885702908039093, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 2171.124969482422, |
|
"epoch": 0.05443940375891121, |
|
"grad_norm": 0.13725849986076355, |
|
"kl": 0.00013327598571777344, |
|
"learning_rate": 5.384615384615384e-07, |
|
"loss": 0.003, |
|
"reward": 0.28723688423633575, |
|
"reward_std": 0.3837554454803467, |
|
"rewards/improved_len_reward_dast": 0.28723688423633575, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 1959.369873046875, |
|
"epoch": 0.057031756318859365, |
|
"grad_norm": 0.14588049054145813, |
|
"kl": 0.00011110305786132812, |
|
"learning_rate": 5.641025641025641e-07, |
|
"loss": 0.0188, |
|
"reward": 0.39568372815847397, |
|
"reward_std": 0.3404585272073746, |
|
"rewards/improved_len_reward_dast": 0.39568372815847397, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1891.4412536621094, |
|
"epoch": 0.059624108878807515, |
|
"grad_norm": 0.11357180029153824, |
|
"kl": 0.00010156631469726562, |
|
"learning_rate": 5.897435897435898e-07, |
|
"loss": 0.0197, |
|
"reward": 0.49445799738168716, |
|
"reward_std": 0.2365701049566269, |
|
"rewards/improved_len_reward_dast": 0.49445799738168716, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1616.0025482177734, |
|
"epoch": 0.06221646143875567, |
|
"grad_norm": 0.12517189979553223, |
|
"kl": 0.0001264810562133789, |
|
"learning_rate": 6.153846153846154e-07, |
|
"loss": 0.0208, |
|
"reward": 0.38593798875808716, |
|
"reward_std": 0.26876550912857056, |
|
"rewards/improved_len_reward_dast": 0.38593798875808716, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 2217.3290405273438, |
|
"epoch": 0.06480881399870382, |
|
"grad_norm": 0.12683141231536865, |
|
"kl": 0.00015163421630859375, |
|
"learning_rate": 6.410256410256411e-07, |
|
"loss": 0.0211, |
|
"reward": 0.35348474979400635, |
|
"reward_std": 0.2698053792119026, |
|
"rewards/improved_len_reward_dast": 0.35348474979400635, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1800.8596801757812, |
|
"epoch": 0.06740116655865197, |
|
"grad_norm": 0.13101568818092346, |
|
"kl": 0.00011932849884033203, |
|
"learning_rate": 6.666666666666666e-07, |
|
"loss": 0.0015, |
|
"reward": 0.40977882593870163, |
|
"reward_std": 0.31720418483018875, |
|
"rewards/improved_len_reward_dast": 0.40977882593870163, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 1796.7372131347656, |
|
"epoch": 0.06999351911860013, |
|
"grad_norm": 0.13930906355381012, |
|
"kl": 0.0001131296157836914, |
|
"learning_rate": 6.923076923076922e-07, |
|
"loss": 0.0671, |
|
"reward": 0.4471106305718422, |
|
"reward_std": 0.2687300704419613, |
|
"rewards/improved_len_reward_dast": 0.4471106305718422, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1676.813720703125, |
|
"epoch": 0.07258587167854828, |
|
"grad_norm": 0.14517556130886078, |
|
"kl": 0.0001157522201538086, |
|
"learning_rate": 7.179487179487179e-07, |
|
"loss": 0.0433, |
|
"reward": 0.4442668706178665, |
|
"reward_std": 0.23423199355602264, |
|
"rewards/improved_len_reward_dast": 0.4442668706178665, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 2002.9489440917969, |
|
"epoch": 0.07517822423849643, |
|
"grad_norm": 0.12484736740589142, |
|
"kl": 0.0001323223114013672, |
|
"learning_rate": 7.435897435897435e-07, |
|
"loss": -0.0168, |
|
"reward": 0.4897717013955116, |
|
"reward_std": 0.24505353346467018, |
|
"rewards/improved_len_reward_dast": 0.4897717013955116, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 1715.0433349609375, |
|
"epoch": 0.07777057679844458, |
|
"grad_norm": 0.11366493999958038, |
|
"kl": 8.857250213623047e-05, |
|
"learning_rate": 7.692307692307693e-07, |
|
"loss": -0.0119, |
|
"reward": 0.24165286868810654, |
|
"reward_std": 0.2886221148073673, |
|
"rewards/improved_len_reward_dast": 0.24165286868810654, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2232.938751220703, |
|
"epoch": 0.08036292935839275, |
|
"grad_norm": 0.10916973650455475, |
|
"kl": 0.00017595291137695312, |
|
"learning_rate": 7.948717948717948e-07, |
|
"loss": 0.0336, |
|
"reward": 0.5083014816045761, |
|
"reward_std": 0.25778181850910187, |
|
"rewards/improved_len_reward_dast": 0.5083014816045761, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 1905.7474060058594, |
|
"epoch": 0.0829552819183409, |
|
"grad_norm": 0.15653526782989502, |
|
"kl": 0.00012862682342529297, |
|
"learning_rate": 8.205128205128205e-07, |
|
"loss": -0.0477, |
|
"reward": 0.2919162670150399, |
|
"reward_std": 0.3391455188393593, |
|
"rewards/improved_len_reward_dast": 0.2919162670150399, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1948.9158325195312, |
|
"epoch": 0.08554763447828904, |
|
"grad_norm": 0.1198260486125946, |
|
"kl": 0.00013387203216552734, |
|
"learning_rate": 8.461538461538461e-07, |
|
"loss": 0.0302, |
|
"reward": 0.4744948521256447, |
|
"reward_std": 0.31309082731604576, |
|
"rewards/improved_len_reward_dast": 0.4744948521256447, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 1995.9464111328125, |
|
"epoch": 0.0881399870382372, |
|
"grad_norm": 0.11656484007835388, |
|
"kl": 0.0001513957977294922, |
|
"learning_rate": 8.717948717948718e-07, |
|
"loss": -0.012, |
|
"reward": 0.30143800005316734, |
|
"reward_std": 0.3431224897503853, |
|
"rewards/improved_len_reward_dast": 0.30143800005316734, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2503.1912536621094, |
|
"epoch": 0.09073233959818536, |
|
"grad_norm": 0.12246429920196533, |
|
"kl": 0.00017714500427246094, |
|
"learning_rate": 8.974358974358974e-07, |
|
"loss": -0.0444, |
|
"reward": 0.19646108895540237, |
|
"reward_std": 0.2645679712295532, |
|
"rewards/improved_len_reward_dast": 0.19646108895540237, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 2238.3545532226562, |
|
"epoch": 0.09332469215813351, |
|
"grad_norm": 0.14648236334323883, |
|
"kl": 0.00016880035400390625, |
|
"learning_rate": 9.230769230769231e-07, |
|
"loss": 0.0673, |
|
"reward": 0.4503837898373604, |
|
"reward_std": 0.21853860095143318, |
|
"rewards/improved_len_reward_dast": 0.4503837898373604, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2044.1734313964844, |
|
"epoch": 0.09591704471808166, |
|
"grad_norm": 0.12917490303516388, |
|
"kl": 0.00019884109497070312, |
|
"learning_rate": 9.487179487179486e-07, |
|
"loss": 0.0635, |
|
"reward": 0.47461262345314026, |
|
"reward_std": 0.2628549002110958, |
|
"rewards/improved_len_reward_dast": 0.47461262345314026, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1669.1275329589844, |
|
"epoch": 0.0985093972780298, |
|
"grad_norm": 0.20408552885055542, |
|
"kl": 0.00013911724090576172, |
|
"learning_rate": 9.743589743589742e-07, |
|
"loss": 0.0726, |
|
"reward": 0.4879928305745125, |
|
"reward_std": 0.2501045912504196, |
|
"rewards/improved_len_reward_dast": 0.4879928305745125, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2456.553466796875, |
|
"epoch": 0.10110174983797797, |
|
"grad_norm": 0.14151060581207275, |
|
"kl": 0.00017547607421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0097, |
|
"reward": 0.19765825755894184, |
|
"reward_std": 0.32448211312294006, |
|
"rewards/improved_len_reward_dast": 0.19765825755894184, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2550.58154296875, |
|
"epoch": 0.10369410239792612, |
|
"grad_norm": 0.12014975398778915, |
|
"kl": 0.00021982192993164062, |
|
"learning_rate": 9.99981450718918e-07, |
|
"loss": 0.0362, |
|
"reward": 0.24847418442368507, |
|
"reward_std": 0.2784438841044903, |
|
"rewards/improved_len_reward_dast": 0.24847418442368507, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 1850.7372131347656, |
|
"epoch": 0.10628645495787427, |
|
"grad_norm": 0.13483257591724396, |
|
"kl": 0.00026726722717285156, |
|
"learning_rate": 9.99925804404898e-07, |
|
"loss": -0.0137, |
|
"reward": 0.23939451575279236, |
|
"reward_std": 0.3702044114470482, |
|
"rewards/improved_len_reward_dast": 0.23939451575279236, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 2014.2091674804688, |
|
"epoch": 0.10887880751782242, |
|
"grad_norm": 0.12142825126647949, |
|
"kl": 0.00023293495178222656, |
|
"learning_rate": 9.998330656454915e-07, |
|
"loss": 0.0555, |
|
"reward": 0.48714711517095566, |
|
"reward_std": 0.2657182738184929, |
|
"rewards/improved_len_reward_dast": 0.48714711517095566, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1960.7218933105469, |
|
"epoch": 0.11147116007777058, |
|
"grad_norm": 0.1477634757757187, |
|
"kl": 0.0003228187561035156, |
|
"learning_rate": 9.99703242086198e-07, |
|
"loss": 0.038, |
|
"reward": 0.4138510562479496, |
|
"reward_std": 0.2605874165892601, |
|
"rewards/improved_len_reward_dast": 0.4138510562479496, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 1840.2117309570312, |
|
"epoch": 0.11406351263771873, |
|
"grad_norm": 0.13796693086624146, |
|
"kl": 0.00024700164794921875, |
|
"learning_rate": 9.995363444298333e-07, |
|
"loss": 0.019, |
|
"reward": 0.5052988603711128, |
|
"reward_std": 0.26378944143652916, |
|
"rewards/improved_len_reward_dast": 0.5052988603711128, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2340.3392639160156, |
|
"epoch": 0.11665586519766688, |
|
"grad_norm": 0.11008527874946594, |
|
"kl": 0.00039124488830566406, |
|
"learning_rate": 9.993323864356492e-07, |
|
"loss": 0.0052, |
|
"reward": 0.22386901453137398, |
|
"reward_std": 0.31205885112285614, |
|
"rewards/improved_len_reward_dast": 0.22386901453137398, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2944.1682739257812, |
|
"epoch": 0.11924821775761503, |
|
"grad_norm": 0.0939781591296196, |
|
"kl": 0.0002903938293457031, |
|
"learning_rate": 9.990913849181977e-07, |
|
"loss": 0.0107, |
|
"reward": 0.27933235839009285, |
|
"reward_std": 0.28421058878302574, |
|
"rewards/improved_len_reward_dast": 0.27933235839009285, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1859.7703704833984, |
|
"epoch": 0.1218405703175632, |
|
"grad_norm": 0.15788155794143677, |
|
"kl": 0.0004353523254394531, |
|
"learning_rate": 9.988133597459444e-07, |
|
"loss": 0.0434, |
|
"reward": 0.387714684009552, |
|
"reward_std": 0.2896231710910797, |
|
"rewards/improved_len_reward_dast": 0.387714684009552, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 1945.9565734863281, |
|
"epoch": 0.12443292287751134, |
|
"grad_norm": 0.2036711722612381, |
|
"kl": 0.0004405975341796875, |
|
"learning_rate": 9.984983338396323e-07, |
|
"loss": 0.0847, |
|
"reward": 0.4099316783249378, |
|
"reward_std": 0.23737533017992973, |
|
"rewards/improved_len_reward_dast": 0.4099316783249378, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 1638.6071166992188, |
|
"epoch": 0.1270252754374595, |
|
"grad_norm": 0.15458206832408905, |
|
"kl": 0.00054168701171875, |
|
"learning_rate": 9.981463331703903e-07, |
|
"loss": 0.0575, |
|
"reward": 0.5117293447256088, |
|
"reward_std": 0.24939828738570213, |
|
"rewards/improved_len_reward_dast": 0.5117293447256088, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 2023.471923828125, |
|
"epoch": 0.12961762799740764, |
|
"grad_norm": 0.14058855175971985, |
|
"kl": 0.0006809234619140625, |
|
"learning_rate": 9.977573867575937e-07, |
|
"loss": -0.0192, |
|
"reward": 0.35891789197921753, |
|
"reward_std": 0.3240862749516964, |
|
"rewards/improved_len_reward_dast": 0.35891789197921753, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 2265.3264770507812, |
|
"epoch": 0.1322099805573558, |
|
"grad_norm": 0.12996900081634521, |
|
"kl": 0.0007781982421875, |
|
"learning_rate": 9.9733152666647e-07, |
|
"loss": -0.0032, |
|
"reward": 0.420456662774086, |
|
"reward_std": 0.30560050904750824, |
|
"rewards/improved_len_reward_dast": 0.420456662774086, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 2654.7677612304688, |
|
"epoch": 0.13480233311730394, |
|
"grad_norm": 0.1508190631866455, |
|
"kl": 0.0006885528564453125, |
|
"learning_rate": 9.968687880054579e-07, |
|
"loss": 0.0619, |
|
"reward": 0.46665582805871964, |
|
"reward_std": 0.26216986030340195, |
|
"rewards/improved_len_reward_dast": 0.46665582805871964, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 1829.56884765625, |
|
"epoch": 0.1373946856772521, |
|
"grad_norm": 0.15750233829021454, |
|
"kl": 0.001232147216796875, |
|
"learning_rate": 9.963692089233104e-07, |
|
"loss": 0.0603, |
|
"reward": 0.35224995017051697, |
|
"reward_std": 0.27216843515634537, |
|
"rewards/improved_len_reward_dast": 0.35224995017051697, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 1852.9693298339844, |
|
"epoch": 0.13998703823720027, |
|
"grad_norm": 0.14929921925067902, |
|
"kl": 0.0011272430419921875, |
|
"learning_rate": 9.958328306059508e-07, |
|
"loss": -0.0025, |
|
"reward": 0.391354002058506, |
|
"reward_std": 0.24403201416134834, |
|
"rewards/improved_len_reward_dast": 0.391354002058506, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 2053.5662536621094, |
|
"epoch": 0.1425793907971484, |
|
"grad_norm": 0.12911133468151093, |
|
"kl": 0.0014352798461914062, |
|
"learning_rate": 9.952596972730782e-07, |
|
"loss": -0.0058, |
|
"reward": 0.3265633024275303, |
|
"reward_std": 0.2745610848069191, |
|
"rewards/improved_len_reward_dast": 0.3265633024275303, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 2230.5382690429688, |
|
"epoch": 0.14517174335709657, |
|
"grad_norm": 0.13832230865955353, |
|
"kl": 0.001399993896484375, |
|
"learning_rate": 9.946498561745201e-07, |
|
"loss": 0.0347, |
|
"reward": 0.4754156991839409, |
|
"reward_std": 0.27349359542131424, |
|
"rewards/improved_len_reward_dast": 0.4754156991839409, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 2175.3060607910156, |
|
"epoch": 0.14776409591704473, |
|
"grad_norm": 0.13634833693504333, |
|
"kl": 0.0013599395751953125, |
|
"learning_rate": 9.94003357586339e-07, |
|
"loss": 0.011, |
|
"reward": 0.535503476858139, |
|
"reward_std": 0.25894078612327576, |
|
"rewards/improved_len_reward_dast": 0.535503476858139, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 2355.7091369628906, |
|
"epoch": 0.15035644847699287, |
|
"grad_norm": 0.11235704272985458, |
|
"kl": 0.0012407302856445312, |
|
"learning_rate": 9.933202548066855e-07, |
|
"loss": 0.0133, |
|
"reward": 0.4234941601753235, |
|
"reward_std": 0.24918782338500023, |
|
"rewards/improved_len_reward_dast": 0.4234941601753235, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2449.9718627929688, |
|
"epoch": 0.15294880103694103, |
|
"grad_norm": 0.11569388210773468, |
|
"kl": 0.001361846923828125, |
|
"learning_rate": 9.926006041514068e-07, |
|
"loss": 0.0195, |
|
"reward": 0.42598315328359604, |
|
"reward_std": 0.3036581464111805, |
|
"rewards/improved_len_reward_dast": 0.42598315328359604, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2177.0101623535156, |
|
"epoch": 0.15554115359688916, |
|
"grad_norm": 0.15036629140377045, |
|
"kl": 0.002197265625, |
|
"learning_rate": 9.918444649494012e-07, |
|
"loss": 0.0499, |
|
"reward": 0.47116725891828537, |
|
"reward_std": 0.2249010019004345, |
|
"rewards/improved_len_reward_dast": 0.47116725891828537, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2315.211669921875, |
|
"epoch": 0.15813350615683733, |
|
"grad_norm": 0.12630639970302582, |
|
"kl": 0.002147674560546875, |
|
"learning_rate": 9.9105189953773e-07, |
|
"loss": 0.0155, |
|
"reward": 0.4634978622198105, |
|
"reward_std": 0.3047446385025978, |
|
"rewards/improved_len_reward_dast": 0.4634978622198105, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 2431.4208374023438, |
|
"epoch": 0.1607258587167855, |
|
"grad_norm": 0.13092079758644104, |
|
"kl": 0.0019016265869140625, |
|
"learning_rate": 9.90222973256475e-07, |
|
"loss": 0.029, |
|
"reward": 0.5218361169099808, |
|
"reward_std": 0.3062875270843506, |
|
"rewards/improved_len_reward_dast": 0.5218361169099808, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2737.2295532226562, |
|
"epoch": 0.16331821127673363, |
|
"grad_norm": 0.12846292555332184, |
|
"kl": 0.001842498779296875, |
|
"learning_rate": 9.89357754443355e-07, |
|
"loss": 0.0341, |
|
"reward": 0.3177746832370758, |
|
"reward_std": 0.2347201406955719, |
|
"rewards/improved_len_reward_dast": 0.3177746832370758, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2820.30859375, |
|
"epoch": 0.1659105638366818, |
|
"grad_norm": 0.11326766014099121, |
|
"kl": 0.002269744873046875, |
|
"learning_rate": 9.884563144280897e-07, |
|
"loss": 0.0303, |
|
"reward": 0.40527529269456863, |
|
"reward_std": 0.27593884617090225, |
|
"rewards/improved_len_reward_dast": 0.40527529269456863, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 2124.1249389648438, |
|
"epoch": 0.16850291639662995, |
|
"grad_norm": 0.16672682762145996, |
|
"kl": 0.002834320068359375, |
|
"learning_rate": 9.875187275265198e-07, |
|
"loss": 0.0854, |
|
"reward": 0.44389794766902924, |
|
"reward_std": 0.239344272762537, |
|
"rewards/improved_len_reward_dast": 0.44389794766902924, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 2648.7473754882812, |
|
"epoch": 0.1710952689565781, |
|
"grad_norm": 0.12017077952623367, |
|
"kl": 0.002216339111328125, |
|
"learning_rate": 9.865450710344807e-07, |
|
"loss": 0.0354, |
|
"reward": 0.34619488939642906, |
|
"reward_std": 0.23249895870685577, |
|
"rewards/improved_len_reward_dast": 0.34619488939642906, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2351.4132385253906, |
|
"epoch": 0.17368762151652625, |
|
"grad_norm": 0.13515809178352356, |
|
"kl": 0.00341796875, |
|
"learning_rate": 9.855354252214307e-07, |
|
"loss": 0.0206, |
|
"reward": 0.4116981029510498, |
|
"reward_std": 0.28415245935320854, |
|
"rewards/improved_len_reward_dast": 0.4116981029510498, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 1852.6810302734375, |
|
"epoch": 0.1762799740764744, |
|
"grad_norm": 0.17753329873085022, |
|
"kl": 0.003093719482421875, |
|
"learning_rate": 9.844898733238311e-07, |
|
"loss": 0.0359, |
|
"reward": 0.5103119313716888, |
|
"reward_std": 0.28220145776867867, |
|
"rewards/improved_len_reward_dast": 0.5103119313716888, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2115.632568359375, |
|
"epoch": 0.17887232663642255, |
|
"grad_norm": 0.14785140752792358, |
|
"kl": 0.003803253173828125, |
|
"learning_rate": 9.83408501538287e-07, |
|
"loss": 0.0448, |
|
"reward": 0.4045008569955826, |
|
"reward_std": 0.27792026475071907, |
|
"rewards/improved_len_reward_dast": 0.4045008569955826, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 1836.5025329589844, |
|
"epoch": 0.18146467919637072, |
|
"grad_norm": 0.13213248550891876, |
|
"kl": 0.003261566162109375, |
|
"learning_rate": 9.822913990144387e-07, |
|
"loss": 0.0005, |
|
"reward": 0.3931305408477783, |
|
"reward_std": 0.30834557116031647, |
|
"rewards/improved_len_reward_dast": 0.3931305408477783, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2272.7142639160156, |
|
"epoch": 0.18405703175631885, |
|
"grad_norm": 0.18136066198349, |
|
"kl": 0.0032138824462890625, |
|
"learning_rate": 9.811386578476146e-07, |
|
"loss": 0.0901, |
|
"reward": 0.5118075683712959, |
|
"reward_std": 0.24460211768746376, |
|
"rewards/improved_len_reward_dast": 0.5118075683712959, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 2046.1580810546875, |
|
"epoch": 0.18664938431626701, |
|
"grad_norm": 0.16394267976284027, |
|
"kl": 0.003330230712890625, |
|
"learning_rate": 9.79950373071236e-07, |
|
"loss": 0.0388, |
|
"reward": 0.476028174161911, |
|
"reward_std": 0.2648630440235138, |
|
"rewards/improved_len_reward_dast": 0.476028174161911, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 2163.252471923828, |
|
"epoch": 0.18924173687621518, |
|
"grad_norm": 0.14138321578502655, |
|
"kl": 0.004444122314453125, |
|
"learning_rate": 9.787266426489845e-07, |
|
"loss": 0.0132, |
|
"reward": 0.441867433488369, |
|
"reward_std": 0.2500956766307354, |
|
"rewards/improved_len_reward_dast": 0.441867433488369, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2289.507598876953, |
|
"epoch": 0.1918340894361633, |
|
"grad_norm": 0.1288045346736908, |
|
"kl": 0.00394439697265625, |
|
"learning_rate": 9.77467567466725e-07, |
|
"loss": 0.005, |
|
"reward": 0.4050723984837532, |
|
"reward_std": 0.3099602647125721, |
|
"rewards/improved_len_reward_dast": 0.4050723984837532, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2383.9718627929688, |
|
"epoch": 0.19442644199611148, |
|
"grad_norm": 0.14801673591136932, |
|
"kl": 0.004150390625, |
|
"learning_rate": 9.761732513241882e-07, |
|
"loss": 0.052, |
|
"reward": 0.4946385696530342, |
|
"reward_std": 0.2349606677889824, |
|
"rewards/improved_len_reward_dast": 0.4946385696530342, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2045.8979187011719, |
|
"epoch": 0.1970187945560596, |
|
"grad_norm": 0.17007729411125183, |
|
"kl": 0.0055694580078125, |
|
"learning_rate": 9.748438009264142e-07, |
|
"loss": 0.0672, |
|
"reward": 0.5539986491203308, |
|
"reward_std": 0.20796510577201843, |
|
"rewards/improved_len_reward_dast": 0.5539986491203308, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 2371.7626953125, |
|
"epoch": 0.19961114711600778, |
|
"grad_norm": 0.12920302152633667, |
|
"kl": 0.00461578369140625, |
|
"learning_rate": 9.734793258749538e-07, |
|
"loss": 0.0104, |
|
"reward": 0.5021207295358181, |
|
"reward_std": 0.2304290495812893, |
|
"rewards/improved_len_reward_dast": 0.5021207295358181, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2612.2882080078125, |
|
"epoch": 0.20220349967595594, |
|
"grad_norm": 0.12678897380828857, |
|
"kl": 0.00510406494140625, |
|
"learning_rate": 9.720799386588358e-07, |
|
"loss": 0.0317, |
|
"reward": 0.4242451824247837, |
|
"reward_std": 0.2596823424100876, |
|
"rewards/improved_len_reward_dast": 0.4242451824247837, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2259.831573486328, |
|
"epoch": 0.20479585223590407, |
|
"grad_norm": 0.1500682830810547, |
|
"kl": 0.00511932373046875, |
|
"learning_rate": 9.706457546452898e-07, |
|
"loss": -0.0061, |
|
"reward": 0.42074430361390114, |
|
"reward_std": 0.24277805909514427, |
|
"rewards/improved_len_reward_dast": 0.42074430361390114, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 2893.040771484375, |
|
"epoch": 0.20738820479585224, |
|
"grad_norm": 0.10854899138212204, |
|
"kl": 0.004856109619140625, |
|
"learning_rate": 9.691768920702379e-07, |
|
"loss": -0.0021, |
|
"reward": 0.33654000610113144, |
|
"reward_std": 0.1983367819339037, |
|
"rewards/improved_len_reward_dast": 0.33654000610113144, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 2305.7474365234375, |
|
"epoch": 0.2099805573558004, |
|
"grad_norm": 0.12965475022792816, |
|
"kl": 0.00487518310546875, |
|
"learning_rate": 9.676734720285456e-07, |
|
"loss": -0.0154, |
|
"reward": 0.4322432279586792, |
|
"reward_std": 0.2519207112491131, |
|
"rewards/improved_len_reward_dast": 0.4322432279586792, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 2196.4744567871094, |
|
"epoch": 0.21257290991574854, |
|
"grad_norm": 0.23942671716213226, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 9.661356184640394e-07, |
|
"loss": 0.0429, |
|
"reward": 0.4929245412349701, |
|
"reward_std": 0.21502425894141197, |
|
"rewards/improved_len_reward_dast": 0.4929245412349701, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 2185.619873046875, |
|
"epoch": 0.2151652624756967, |
|
"grad_norm": 0.1231599673628807, |
|
"kl": 0.00595855712890625, |
|
"learning_rate": 9.64563458159288e-07, |
|
"loss": 0.0282, |
|
"reward": 0.4786108732223511, |
|
"reward_std": 0.28226276487112045, |
|
"rewards/improved_len_reward_dast": 0.4786108732223511, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2437.6173400878906, |
|
"epoch": 0.21775761503564484, |
|
"grad_norm": 0.14213241636753082, |
|
"kl": 0.005523681640625, |
|
"learning_rate": 9.629571207251515e-07, |
|
"loss": 0.0442, |
|
"reward": 0.48524054139852524, |
|
"reward_std": 0.22880307212471962, |
|
"rewards/improved_len_reward_dast": 0.48524054139852524, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 2338.068817138672, |
|
"epoch": 0.220349967595593, |
|
"grad_norm": 0.1439816653728485, |
|
"kl": 0.0064239501953125, |
|
"learning_rate": 9.613167385900944e-07, |
|
"loss": 0.0381, |
|
"reward": 0.32151066698133945, |
|
"reward_std": 0.2240244559943676, |
|
"rewards/improved_len_reward_dast": 0.32151066698133945, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2180.928497314453, |
|
"epoch": 0.22294232015554116, |
|
"grad_norm": 0.1326073408126831, |
|
"kl": 0.00629425048828125, |
|
"learning_rate": 9.59642446989269e-07, |
|
"loss": -0.0225, |
|
"reward": 0.46004387736320496, |
|
"reward_std": 0.29043208435177803, |
|
"rewards/improved_len_reward_dast": 0.46004387736320496, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 2378.7499389648438, |
|
"epoch": 0.2255346727154893, |
|
"grad_norm": 0.13580094277858734, |
|
"kl": 0.00611114501953125, |
|
"learning_rate": 9.579343839533668e-07, |
|
"loss": 0.0344, |
|
"reward": 0.46859942376613617, |
|
"reward_std": 0.213957991451025, |
|
"rewards/improved_len_reward_dast": 0.46859942376613617, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2216.5968627929688, |
|
"epoch": 0.22812702527543746, |
|
"grad_norm": 0.1301027089357376, |
|
"kl": 0.00576019287109375, |
|
"learning_rate": 9.561926902972378e-07, |
|
"loss": 0.0187, |
|
"reward": 0.4914589300751686, |
|
"reward_std": 0.2609393447637558, |
|
"rewards/improved_len_reward_dast": 0.4914589300751686, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1875.9923095703125, |
|
"epoch": 0.23071937783538563, |
|
"grad_norm": 0.14184825122356415, |
|
"kl": 0.00583648681640625, |
|
"learning_rate": 9.544175096082838e-07, |
|
"loss": 0.0363, |
|
"reward": 0.5709837153553963, |
|
"reward_std": 0.2582616098225117, |
|
"rewards/improved_len_reward_dast": 0.5709837153553963, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 2039.7499694824219, |
|
"epoch": 0.23331173039533376, |
|
"grad_norm": 0.14913320541381836, |
|
"kl": 0.00595855712890625, |
|
"learning_rate": 9.526089882346172e-07, |
|
"loss": 0.045, |
|
"reward": 0.4515961930155754, |
|
"reward_std": 0.2736925035715103, |
|
"rewards/improved_len_reward_dast": 0.4515961930155754, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 2091.1401977539062, |
|
"epoch": 0.23590408295528192, |
|
"grad_norm": 0.15127432346343994, |
|
"kl": 0.005279541015625, |
|
"learning_rate": 9.507672752730001e-07, |
|
"loss": 0.0054, |
|
"reward": 0.402485728263855, |
|
"reward_std": 0.29755549877882004, |
|
"rewards/improved_len_reward_dast": 0.402485728263855, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2158.0076293945312, |
|
"epoch": 0.23849643551523006, |
|
"grad_norm": 0.13994944095611572, |
|
"kl": 0.0059356689453125, |
|
"learning_rate": 9.4889252255655e-07, |
|
"loss": 0.0189, |
|
"reward": 0.4199903607368469, |
|
"reward_std": 0.2342899888753891, |
|
"rewards/improved_len_reward_dast": 0.4199903607368469, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 2170.688720703125, |
|
"epoch": 0.24108878807517822, |
|
"grad_norm": 0.1373533308506012, |
|
"kl": 0.00689697265625, |
|
"learning_rate": 9.469848846422223e-07, |
|
"loss": -0.0002, |
|
"reward": 0.2835959419608116, |
|
"reward_std": 0.25099899992346764, |
|
"rewards/improved_len_reward_dast": 0.2835959419608116, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 2395.0025329589844, |
|
"epoch": 0.2436811406351264, |
|
"grad_norm": 0.1902032494544983, |
|
"kl": 0.00583648681640625, |
|
"learning_rate": 9.450445187980699e-07, |
|
"loss": 0.0584, |
|
"reward": 0.35247352346777916, |
|
"reward_std": 0.30481256917119026, |
|
"rewards/improved_len_reward_dast": 0.35247352346777916, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 2286.4693603515625, |
|
"epoch": 0.24627349319507452, |
|
"grad_norm": 0.13147291541099548, |
|
"kl": 0.00756072998046875, |
|
"learning_rate": 9.430715849902774e-07, |
|
"loss": -0.0056, |
|
"reward": 0.4530554786324501, |
|
"reward_std": 0.25667278096079826, |
|
"rewards/improved_len_reward_dast": 0.4530554786324501, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 2043.8111877441406, |
|
"epoch": 0.24886584575502269, |
|
"grad_norm": 0.18014930188655853, |
|
"kl": 0.00627899169921875, |
|
"learning_rate": 9.410662458699723e-07, |
|
"loss": 0.0794, |
|
"reward": 0.48161032050848007, |
|
"reward_std": 0.21756469458341599, |
|
"rewards/improved_len_reward_dast": 0.48161032050848007, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1489.5254821777344, |
|
"epoch": 0.25145819831497085, |
|
"grad_norm": 0.17594148218631744, |
|
"kl": 0.0061187744140625, |
|
"learning_rate": 9.390286667598169e-07, |
|
"loss": 0.0505, |
|
"reward": 0.4777970463037491, |
|
"reward_std": 0.2606087028980255, |
|
"rewards/improved_len_reward_dast": 0.4777970463037491, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1849.7627258300781, |
|
"epoch": 0.254050550874919, |
|
"grad_norm": 0.18699869513511658, |
|
"kl": 0.00595855712890625, |
|
"learning_rate": 9.369590156403784e-07, |
|
"loss": 0.021, |
|
"reward": 0.5154428780078888, |
|
"reward_std": 0.25940926000475883, |
|
"rewards/improved_len_reward_dast": 0.5154428780078888, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1921.4820861816406, |
|
"epoch": 0.2566429034348671, |
|
"grad_norm": 0.17932431399822235, |
|
"kl": 0.00734710693359375, |
|
"learning_rate": 9.348574631362808e-07, |
|
"loss": 0.0448, |
|
"reward": 0.518772654235363, |
|
"reward_std": 0.22472433000802994, |
|
"rewards/improved_len_reward_dast": 0.518772654235363, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 1558.698959350586, |
|
"epoch": 0.2592352559948153, |
|
"grad_norm": 0.2216687798500061, |
|
"kl": 0.00499725341796875, |
|
"learning_rate": 9.327241825021379e-07, |
|
"loss": 0.0994, |
|
"reward": 0.5730864778161049, |
|
"reward_std": 0.23495277762413025, |
|
"rewards/improved_len_reward_dast": 0.5730864778161049, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2075.681121826172, |
|
"epoch": 0.26182760855476345, |
|
"grad_norm": 0.17743688821792603, |
|
"kl": 0.00689697265625, |
|
"learning_rate": 9.3055934960827e-07, |
|
"loss": 0.0504, |
|
"reward": 0.49749719351530075, |
|
"reward_std": 0.24989648535847664, |
|
"rewards/improved_len_reward_dast": 0.49749719351530075, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2060.5254821777344, |
|
"epoch": 0.2644199611147116, |
|
"grad_norm": 0.1442503184080124, |
|
"kl": 0.00804901123046875, |
|
"learning_rate": 9.283631429262053e-07, |
|
"loss": 0.0444, |
|
"reward": 0.5253574028611183, |
|
"reward_std": 0.26894206926226616, |
|
"rewards/improved_len_reward_dast": 0.5253574028611183, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 1975.1708679199219, |
|
"epoch": 0.2670123136746598, |
|
"grad_norm": 0.1966535896062851, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 9.261357435139665e-07, |
|
"loss": -0.0423, |
|
"reward": 0.3904944434762001, |
|
"reward_std": 0.252847608178854, |
|
"rewards/improved_len_reward_dast": 0.3904944434762001, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 1903.8239440917969, |
|
"epoch": 0.2696046662346079, |
|
"grad_norm": 0.16377773880958557, |
|
"kl": 0.00766754150390625, |
|
"learning_rate": 9.238773350011437e-07, |
|
"loss": 0.0337, |
|
"reward": 0.5364998355507851, |
|
"reward_std": 0.22807660326361656, |
|
"rewards/improved_len_reward_dast": 0.5364998355507851, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 1964.03564453125, |
|
"epoch": 0.27219701879455604, |
|
"grad_norm": 0.15731562674045563, |
|
"kl": 0.00638580322265625, |
|
"learning_rate": 9.215881035737557e-07, |
|
"loss": 0.03, |
|
"reward": 0.5687462911009789, |
|
"reward_std": 0.24159640073776245, |
|
"rewards/improved_len_reward_dast": 0.5687462911009789, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2086.864776611328, |
|
"epoch": 0.2747893713545042, |
|
"grad_norm": 0.1402043104171753, |
|
"kl": 0.007659912109375, |
|
"learning_rate": 9.192682379589017e-07, |
|
"loss": 0.0097, |
|
"reward": 0.5089325457811356, |
|
"reward_std": 0.3301768898963928, |
|
"rewards/improved_len_reward_dast": 0.5089325457811356, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2234.5228576660156, |
|
"epoch": 0.27738172391445237, |
|
"grad_norm": 0.11498509347438812, |
|
"kl": 0.0081634521484375, |
|
"learning_rate": 9.169179294092006e-07, |
|
"loss": 0.0083, |
|
"reward": 0.4969679266214371, |
|
"reward_std": 0.23030569776892662, |
|
"rewards/improved_len_reward_dast": 0.4969679266214371, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 1707.2627258300781, |
|
"epoch": 0.27997407647440054, |
|
"grad_norm": 0.14997334778308868, |
|
"kl": 0.006744384765625, |
|
"learning_rate": 9.145373716870257e-07, |
|
"loss": 0.0103, |
|
"reward": 0.49637529253959656, |
|
"reward_std": 0.2502065673470497, |
|
"rewards/improved_len_reward_dast": 0.49637529253959656, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2420.5662231445312, |
|
"epoch": 0.2825664290343487, |
|
"grad_norm": 0.18093426525592804, |
|
"kl": 0.009765625, |
|
"learning_rate": 9.121267610485294e-07, |
|
"loss": 0.0507, |
|
"reward": 0.45108526200056076, |
|
"reward_std": 0.27778685092926025, |
|
"rewards/improved_len_reward_dast": 0.45108526200056076, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 1913.4413146972656, |
|
"epoch": 0.2851587815942968, |
|
"grad_norm": 0.129420667886734, |
|
"kl": 0.00807952880859375, |
|
"learning_rate": 9.096862962274642e-07, |
|
"loss": 0.0184, |
|
"reward": 0.47038712725043297, |
|
"reward_std": 0.23162546008825302, |
|
"rewards/improved_len_reward_dast": 0.47038712725043297, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 2138.313751220703, |
|
"epoch": 0.28775113415424497, |
|
"grad_norm": 0.13907843828201294, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 9.072161784187988e-07, |
|
"loss": 0.003, |
|
"reward": 0.39113760739564896, |
|
"reward_std": 0.31567446142435074, |
|
"rewards/improved_len_reward_dast": 0.39113760739564896, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 1697.0076293945312, |
|
"epoch": 0.29034348671419313, |
|
"grad_norm": 0.1390790045261383, |
|
"kl": 0.00740814208984375, |
|
"learning_rate": 9.047166112621312e-07, |
|
"loss": 0.0311, |
|
"reward": 0.4854539856314659, |
|
"reward_std": 0.25831175222992897, |
|
"rewards/improved_len_reward_dast": 0.4854539856314659, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 1920.2116394042969, |
|
"epoch": 0.2929358392741413, |
|
"grad_norm": 0.15448330342769623, |
|
"kl": 0.008331298828125, |
|
"learning_rate": 9.021878008249001e-07, |
|
"loss": 0.0309, |
|
"reward": 0.5033985450863838, |
|
"reward_std": 0.2514248192310333, |
|
"rewards/improved_len_reward_dast": 0.5033985450863838, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1778.3877258300781, |
|
"epoch": 0.29552819183408946, |
|
"grad_norm": 0.18551130592823029, |
|
"kl": 0.0085601806640625, |
|
"learning_rate": 8.996299555853973e-07, |
|
"loss": 0.0592, |
|
"reward": 0.5357353314757347, |
|
"reward_std": 0.25978413224220276, |
|
"rewards/improved_len_reward_dast": 0.5357353314757347, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 1744.0815734863281, |
|
"epoch": 0.29812054439403757, |
|
"grad_norm": 0.1741955578327179, |
|
"kl": 0.008270263671875, |
|
"learning_rate": 8.970432864155798e-07, |
|
"loss": 0.0581, |
|
"reward": 0.44869130104780197, |
|
"reward_std": 0.27859310433268547, |
|
"rewards/improved_len_reward_dast": 0.44869130104780197, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 1891.1683044433594, |
|
"epoch": 0.30071289695398573, |
|
"grad_norm": 0.16727615892887115, |
|
"kl": 0.0091094970703125, |
|
"learning_rate": 8.944280065636851e-07, |
|
"loss": -0.0155, |
|
"reward": 0.5161800310015678, |
|
"reward_std": 0.2364092469215393, |
|
"rewards/improved_len_reward_dast": 0.5161800310015678, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 1760.517822265625, |
|
"epoch": 0.3033052495139339, |
|
"grad_norm": 0.16244257986545563, |
|
"kl": 0.0087127685546875, |
|
"learning_rate": 8.917843316366515e-07, |
|
"loss": 0.0261, |
|
"reward": 0.5459260642528534, |
|
"reward_std": 0.23583999276161194, |
|
"rewards/improved_len_reward_dast": 0.5459260642528534, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 1983.0484619140625, |
|
"epoch": 0.30589760207388206, |
|
"grad_norm": 0.1473926603794098, |
|
"kl": 0.00963592529296875, |
|
"learning_rate": 8.891124795823426e-07, |
|
"loss": 0.0132, |
|
"reward": 0.3325341437011957, |
|
"reward_std": 0.25720784440636635, |
|
"rewards/improved_len_reward_dast": 0.3325341437011957, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1990.2703552246094, |
|
"epoch": 0.3084899546338302, |
|
"grad_norm": 0.15402625501155853, |
|
"kl": 0.009613037109375, |
|
"learning_rate": 8.864126706715796e-07, |
|
"loss": 0.0303, |
|
"reward": 0.47167035937309265, |
|
"reward_std": 0.20868681743741035, |
|
"rewards/improved_len_reward_dast": 0.47167035937309265, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 2067.3621826171875, |
|
"epoch": 0.31108230719377833, |
|
"grad_norm": 0.14856307208538055, |
|
"kl": 0.0103302001953125, |
|
"learning_rate": 8.83685127479982e-07, |
|
"loss": 0.0497, |
|
"reward": 0.5158610492944717, |
|
"reward_std": 0.24829266592860222, |
|
"rewards/improved_len_reward_dast": 0.5158610492944717, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 1695.1377410888672, |
|
"epoch": 0.3136746597537265, |
|
"grad_norm": 0.1789526492357254, |
|
"kl": 0.00838470458984375, |
|
"learning_rate": 8.809300748696173e-07, |
|
"loss": 0.0452, |
|
"reward": 0.4681314527988434, |
|
"reward_std": 0.27835647389292717, |
|
"rewards/improved_len_reward_dast": 0.4681314527988434, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2345.4464111328125, |
|
"epoch": 0.31626701231367466, |
|
"grad_norm": 0.1702735722064972, |
|
"kl": 0.01092529296875, |
|
"learning_rate": 8.781477399704652e-07, |
|
"loss": 0.0505, |
|
"reward": 0.4150802828371525, |
|
"reward_std": 0.2282225303351879, |
|
"rewards/improved_len_reward_dast": 0.4150802828371525, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2215.8545532226562, |
|
"epoch": 0.3188593648736228, |
|
"grad_norm": 0.1643056720495224, |
|
"kl": 0.010467529296875, |
|
"learning_rate": 8.753383521616902e-07, |
|
"loss": -0.0006, |
|
"reward": 0.4899800196290016, |
|
"reward_std": 0.2781127095222473, |
|
"rewards/improved_len_reward_dast": 0.4899800196290016, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1792.7601928710938, |
|
"epoch": 0.321451717433571, |
|
"grad_norm": 0.18634077906608582, |
|
"kl": 0.00847625732421875, |
|
"learning_rate": 8.72502143052733e-07, |
|
"loss": 0.0014, |
|
"reward": 0.3171217106282711, |
|
"reward_std": 0.27590419724583626, |
|
"rewards/improved_len_reward_dast": 0.3171217106282711, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 1776.8545227050781, |
|
"epoch": 0.32404406999351915, |
|
"grad_norm": 0.12118836492300034, |
|
"kl": 0.0086212158203125, |
|
"learning_rate": 8.696393464642158e-07, |
|
"loss": -0.0068, |
|
"reward": 0.5544345825910568, |
|
"reward_std": 0.24507181718945503, |
|
"rewards/improved_len_reward_dast": 0.5544345825910568, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1694.5203857421875, |
|
"epoch": 0.32663642255346725, |
|
"grad_norm": 0.1287376880645752, |
|
"kl": 0.00778961181640625, |
|
"learning_rate": 8.667501984086655e-07, |
|
"loss": 0.007, |
|
"reward": 0.5977945774793625, |
|
"reward_std": 0.245724493637681, |
|
"rewards/improved_len_reward_dast": 0.5977945774793625, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1636.5458679199219, |
|
"epoch": 0.3292287751134154, |
|
"grad_norm": 0.14778032898902893, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 8.638349370710573e-07, |
|
"loss": 0.0288, |
|
"reward": 0.4892159327864647, |
|
"reward_std": 0.21941957622766495, |
|
"rewards/improved_len_reward_dast": 0.4892159327864647, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 1714.7295532226562, |
|
"epoch": 0.3318211276733636, |
|
"grad_norm": 0.1856471598148346, |
|
"kl": 0.00775909423828125, |
|
"learning_rate": 8.608938027891775e-07, |
|
"loss": -0.0209, |
|
"reward": 0.49680083245038986, |
|
"reward_std": 0.31941552460193634, |
|
"rewards/improved_len_reward_dast": 0.49680083245038986, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 1622.4820861816406, |
|
"epoch": 0.33441348023331174, |
|
"grad_norm": 0.14166595041751862, |
|
"kl": 0.00904083251953125, |
|
"learning_rate": 8.579270380338107e-07, |
|
"loss": 0.0349, |
|
"reward": 0.5298355668783188, |
|
"reward_std": 0.25498587638139725, |
|
"rewards/improved_len_reward_dast": 0.5298355668783188, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2226.5203552246094, |
|
"epoch": 0.3370058327932599, |
|
"grad_norm": 0.13678768277168274, |
|
"kl": 0.010528564453125, |
|
"learning_rate": 8.549348873887496e-07, |
|
"loss": 0.0306, |
|
"reward": 0.32516562566161156, |
|
"reward_std": 0.2915949523448944, |
|
"rewards/improved_len_reward_dast": 0.32516562566161156, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 1750.5968933105469, |
|
"epoch": 0.339598185353208, |
|
"grad_norm": 0.6094262003898621, |
|
"kl": 0.01262664794921875, |
|
"learning_rate": 8.519175975306312e-07, |
|
"loss": 0.0175, |
|
"reward": 0.34795505669899285, |
|
"reward_std": 0.29810576513409615, |
|
"rewards/improved_len_reward_dast": 0.34795505669899285, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 1810.3775024414062, |
|
"epoch": 0.3421905379131562, |
|
"grad_norm": 0.13672804832458496, |
|
"kl": 0.0082855224609375, |
|
"learning_rate": 8.48875417208601e-07, |
|
"loss": 0.0123, |
|
"reward": 0.5421003252267838, |
|
"reward_std": 0.23054074123501778, |
|
"rewards/improved_len_reward_dast": 0.5421003252267838, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 1827.4871826171875, |
|
"epoch": 0.34478289047310434, |
|
"grad_norm": 0.18760238587856293, |
|
"kl": 0.00931549072265625, |
|
"learning_rate": 8.458085972238048e-07, |
|
"loss": -0.0378, |
|
"reward": 0.3316265791654587, |
|
"reward_std": 0.3142329826951027, |
|
"rewards/improved_len_reward_dast": 0.3316265791654587, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1851.31884765625, |
|
"epoch": 0.3473752430330525, |
|
"grad_norm": 0.17281284928321838, |
|
"kl": 0.01065826416015625, |
|
"learning_rate": 8.427173904087138e-07, |
|
"loss": 0.0098, |
|
"reward": 0.3921409696340561, |
|
"reward_std": 0.3084140866994858, |
|
"rewards/improved_len_reward_dast": 0.3921409696340561, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2403.4004516601562, |
|
"epoch": 0.34996759559300067, |
|
"grad_norm": 0.12713146209716797, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 8.396020516062794e-07, |
|
"loss": 0.0099, |
|
"reward": 0.3863501325249672, |
|
"reward_std": 0.23984722048044205, |
|
"rewards/improved_len_reward_dast": 0.3863501325249672, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1436.4285278320312, |
|
"epoch": 0.3525599481529488, |
|
"grad_norm": 0.23622222244739532, |
|
"kl": 0.00687408447265625, |
|
"learning_rate": 8.364628376489242e-07, |
|
"loss": 0.0785, |
|
"reward": 0.6200843900442123, |
|
"reward_std": 0.2323874980211258, |
|
"rewards/improved_len_reward_dast": 0.6200843900442123, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1340.3367004394531, |
|
"epoch": 0.35515230071289694, |
|
"grad_norm": 0.14602254331111908, |
|
"kl": 0.0074920654296875, |
|
"learning_rate": 8.333000073373685e-07, |
|
"loss": 0.0062, |
|
"reward": 0.5035427659749985, |
|
"reward_std": 0.2632300853729248, |
|
"rewards/improved_len_reward_dast": 0.5035427659749985, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1822.9515075683594, |
|
"epoch": 0.3577446532728451, |
|
"grad_norm": 0.1399223655462265, |
|
"kl": 0.0090789794921875, |
|
"learning_rate": 8.301138214192945e-07, |
|
"loss": 0.0151, |
|
"reward": 0.45684105157852173, |
|
"reward_std": 0.28661157563328743, |
|
"rewards/improved_len_reward_dast": 0.45684105157852173, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 1932.7831115722656, |
|
"epoch": 0.36033700583279327, |
|
"grad_norm": 0.14497311413288116, |
|
"kl": 0.010772705078125, |
|
"learning_rate": 8.269045425678497e-07, |
|
"loss": 0.026, |
|
"reward": 0.5445379167795181, |
|
"reward_std": 0.23734620586037636, |
|
"rewards/improved_len_reward_dast": 0.5445379167795181, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 1724.9464111328125, |
|
"epoch": 0.36292935839274143, |
|
"grad_norm": 0.15263906121253967, |
|
"kl": 0.00861358642578125, |
|
"learning_rate": 8.236724353599918e-07, |
|
"loss": 0.0175, |
|
"reward": 0.667856439948082, |
|
"reward_std": 0.23194020241498947, |
|
"rewards/improved_len_reward_dast": 0.667856439948082, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 1628.3902893066406, |
|
"epoch": 0.36552171095268954, |
|
"grad_norm": 0.1453440636396408, |
|
"kl": 0.00885009765625, |
|
"learning_rate": 8.204177662546763e-07, |
|
"loss": 0.0253, |
|
"reward": 0.5715875178575516, |
|
"reward_std": 0.20206843689084053, |
|
"rewards/improved_len_reward_dast": 0.5715875178575516, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 1601.8902893066406, |
|
"epoch": 0.3681140635126377, |
|
"grad_norm": 0.24610204994678497, |
|
"kl": 0.00897979736328125, |
|
"learning_rate": 8.171408035708906e-07, |
|
"loss": 0.077, |
|
"reward": 0.5970557183027267, |
|
"reward_std": 0.21772165596485138, |
|
"rewards/improved_len_reward_dast": 0.5970557183027267, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 1380.9310913085938, |
|
"epoch": 0.37070641607258586, |
|
"grad_norm": 0.1606459617614746, |
|
"kl": 0.0074005126953125, |
|
"learning_rate": 8.138418174655323e-07, |
|
"loss": 0.0208, |
|
"reward": 0.6311784163117409, |
|
"reward_std": 0.21403341740369797, |
|
"rewards/improved_len_reward_dast": 0.6311784163117409, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2226.7908325195312, |
|
"epoch": 0.37329876863253403, |
|
"grad_norm": 0.13883228600025177, |
|
"kl": 0.0110626220703125, |
|
"learning_rate": 8.105210799111366e-07, |
|
"loss": 0.0192, |
|
"reward": 0.4252306818962097, |
|
"reward_std": 0.2911713309586048, |
|
"rewards/improved_len_reward_dast": 0.4252306818962097, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 1767.211669921875, |
|
"epoch": 0.3758911211924822, |
|
"grad_norm": 0.13391607999801636, |
|
"kl": 0.01000213623046875, |
|
"learning_rate": 8.071788646734564e-07, |
|
"loss": -0.0125, |
|
"reward": 0.5215009152889252, |
|
"reward_std": 0.20849771052598953, |
|
"rewards/improved_len_reward_dast": 0.5215009152889252, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 1668.6147766113281, |
|
"epoch": 0.37848347375243035, |
|
"grad_norm": 0.14263834059238434, |
|
"kl": 0.00905609130859375, |
|
"learning_rate": 8.038154472888909e-07, |
|
"loss": -0.0016, |
|
"reward": 0.5347848311066628, |
|
"reward_std": 0.2661595940589905, |
|
"rewards/improved_len_reward_dast": 0.5347848311066628, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 1830.3596801757812, |
|
"epoch": 0.38107582631237846, |
|
"grad_norm": 0.17779265344142914, |
|
"kl": 0.0116119384765625, |
|
"learning_rate": 8.004311050417711e-07, |
|
"loss": -0.0093, |
|
"reward": 0.49393337965011597, |
|
"reward_std": 0.2844499684870243, |
|
"rewards/improved_len_reward_dast": 0.49393337965011597, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1709.0025329589844, |
|
"epoch": 0.3836681788723266, |
|
"grad_norm": 0.15148715674877167, |
|
"kl": 0.0105743408203125, |
|
"learning_rate": 7.970261169414999e-07, |
|
"loss": 0.0157, |
|
"reward": 0.5047090724110603, |
|
"reward_std": 0.2441636137664318, |
|
"rewards/improved_len_reward_dast": 0.5047090724110603, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2119.3775024414062, |
|
"epoch": 0.3862605314322748, |
|
"grad_norm": 0.22775354981422424, |
|
"kl": 0.0129241943359375, |
|
"learning_rate": 7.936007636995497e-07, |
|
"loss": 0.0774, |
|
"reward": 0.49651817977428436, |
|
"reward_std": 0.2400597222149372, |
|
"rewards/improved_len_reward_dast": 0.49651817977428436, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 1689.4004821777344, |
|
"epoch": 0.38885288399222295, |
|
"grad_norm": 0.15328077971935272, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 7.901553277063213e-07, |
|
"loss": 0.0078, |
|
"reward": 0.3569689057767391, |
|
"reward_std": 0.3229844532907009, |
|
"rewards/improved_len_reward_dast": 0.3569689057767391, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 2073.864776611328, |
|
"epoch": 0.3914452365521711, |
|
"grad_norm": 0.19549565017223358, |
|
"kl": 0.0137786865234375, |
|
"learning_rate": 7.866900930078618e-07, |
|
"loss": 0.0528, |
|
"reward": 0.5197746828198433, |
|
"reward_std": 0.24571574851870537, |
|
"rewards/improved_len_reward_dast": 0.5197746828198433, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 1904.7984313964844, |
|
"epoch": 0.3940375891121192, |
|
"grad_norm": 0.16537566483020782, |
|
"kl": 0.0112762451171875, |
|
"learning_rate": 7.832053452824489e-07, |
|
"loss": 0.0443, |
|
"reward": 0.5653045251965523, |
|
"reward_std": 0.26458077877759933, |
|
"rewards/improved_len_reward_dast": 0.5653045251965523, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 1810.0535278320312, |
|
"epoch": 0.3966299416720674, |
|
"grad_norm": 0.1673276126384735, |
|
"kl": 0.01409912109375, |
|
"learning_rate": 7.797013718170384e-07, |
|
"loss": 0.0361, |
|
"reward": 0.4529588147997856, |
|
"reward_std": 0.24421193450689316, |
|
"rewards/improved_len_reward_dast": 0.4529588147997856, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1624.7015075683594, |
|
"epoch": 0.39922229423201555, |
|
"grad_norm": 0.15807899832725525, |
|
"kl": 0.011444091796875, |
|
"learning_rate": 7.761784614835801e-07, |
|
"loss": 0.014, |
|
"reward": 0.4734058678150177, |
|
"reward_std": 0.32842234522104263, |
|
"rewards/improved_len_reward_dast": 0.4734058678150177, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 1876.540756225586, |
|
"epoch": 0.4018146467919637, |
|
"grad_norm": 0.18241144716739655, |
|
"kl": 0.01232147216796875, |
|
"learning_rate": 7.726369047152029e-07, |
|
"loss": 0.0244, |
|
"reward": 0.4645438566803932, |
|
"reward_std": 0.2389094103127718, |
|
"rewards/improved_len_reward_dast": 0.4645438566803932, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 1693.3367004394531, |
|
"epoch": 0.4044069993519119, |
|
"grad_norm": 0.17326728999614716, |
|
"kl": 0.00902557373046875, |
|
"learning_rate": 7.690769934822712e-07, |
|
"loss": 0.0494, |
|
"reward": 0.4986276477575302, |
|
"reward_std": 0.2973395735025406, |
|
"rewards/improved_len_reward_dast": 0.4986276477575302, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1658.6785278320312, |
|
"epoch": 0.40699935191186, |
|
"grad_norm": 0.20593588054180145, |
|
"kl": 0.0114288330078125, |
|
"learning_rate": 7.654990212683142e-07, |
|
"loss": -0.0131, |
|
"reward": 0.5161425247788429, |
|
"reward_std": 0.2771513797342777, |
|
"rewards/improved_len_reward_dast": 0.5161425247788429, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 1728.2755126953125, |
|
"epoch": 0.40959170447180815, |
|
"grad_norm": 0.16088007390499115, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 7.619032830458307e-07, |
|
"loss": 0.0392, |
|
"reward": 0.6053505837917328, |
|
"reward_std": 0.22085025534033775, |
|
"rewards/improved_len_reward_dast": 0.6053505837917328, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 2031.5408325195312, |
|
"epoch": 0.4121840570317563, |
|
"grad_norm": 0.1635725349187851, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 7.582900752519723e-07, |
|
"loss": -0.0071, |
|
"reward": 0.5291006043553352, |
|
"reward_std": 0.253378227353096, |
|
"rewards/improved_len_reward_dast": 0.5291006043553352, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 2087.8060607910156, |
|
"epoch": 0.4147764095917045, |
|
"grad_norm": 0.19703420996665955, |
|
"kl": 0.013885498046875, |
|
"learning_rate": 7.546596957641031e-07, |
|
"loss": 0.0142, |
|
"reward": 0.4236784651875496, |
|
"reward_std": 0.264580138027668, |
|
"rewards/improved_len_reward_dast": 0.4236784651875496, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 1966.3953247070312, |
|
"epoch": 0.41736876215165264, |
|
"grad_norm": 0.17154966294765472, |
|
"kl": 0.016204833984375, |
|
"learning_rate": 7.510124438752432e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5379416197538376, |
|
"reward_std": 0.2562957741320133, |
|
"rewards/improved_len_reward_dast": 0.5379416197538376, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1525.1632080078125, |
|
"epoch": 0.4199611147116008, |
|
"grad_norm": 0.14776575565338135, |
|
"kl": 0.01141357421875, |
|
"learning_rate": 7.473486202693949e-07, |
|
"loss": 0.0315, |
|
"reward": 0.69241763651371, |
|
"reward_std": 0.22519692406058311, |
|
"rewards/improved_len_reward_dast": 0.69241763651371, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 1797.1249694824219, |
|
"epoch": 0.4225534672715489, |
|
"grad_norm": 0.17613932490348816, |
|
"kl": 0.0154266357421875, |
|
"learning_rate": 7.43668526996753e-07, |
|
"loss": 0.0094, |
|
"reward": 0.48756927251815796, |
|
"reward_std": 0.300619401037693, |
|
"rewards/improved_len_reward_dast": 0.48756927251815796, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1866.8596801757812, |
|
"epoch": 0.4251458198314971, |
|
"grad_norm": 0.1625932902097702, |
|
"kl": 0.015716552734375, |
|
"learning_rate": 7.399724674488046e-07, |
|
"loss": 0.0021, |
|
"reward": 0.46739767491817474, |
|
"reward_std": 0.24503038078546524, |
|
"rewards/improved_len_reward_dast": 0.46739767491817474, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 1815.2423095703125, |
|
"epoch": 0.42773817239144524, |
|
"grad_norm": 0.23522265255451202, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 7.36260746333316e-07, |
|
"loss": 0.049, |
|
"reward": 0.5157830119132996, |
|
"reward_std": 0.1916877217590809, |
|
"rewards/improved_len_reward_dast": 0.5157830119132996, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1679.2806091308594, |
|
"epoch": 0.4303305249513934, |
|
"grad_norm": 0.1362147033214569, |
|
"kl": 0.0125274658203125, |
|
"learning_rate": 7.325336696492128e-07, |
|
"loss": 0.0273, |
|
"reward": 0.5556403249502182, |
|
"reward_std": 0.22342020645737648, |
|
"rewards/improved_len_reward_dast": 0.5556403249502182, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 1882.0254516601562, |
|
"epoch": 0.43292287751134156, |
|
"grad_norm": 0.17028290033340454, |
|
"kl": 0.0140838623046875, |
|
"learning_rate": 7.287915446613531e-07, |
|
"loss": 0.0281, |
|
"reward": 0.48191484808921814, |
|
"reward_std": 0.2616124339401722, |
|
"rewards/improved_len_reward_dast": 0.48191484808921814, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 2080.8468627929688, |
|
"epoch": 0.43551523007128967, |
|
"grad_norm": 0.16656753420829773, |
|
"kl": 0.019134521484375, |
|
"learning_rate": 7.250346798751953e-07, |
|
"loss": -0.0133, |
|
"reward": 0.4320894777774811, |
|
"reward_std": 0.30758891999721527, |
|
"rewards/improved_len_reward_dast": 0.4320894777774811, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 1616.6836700439453, |
|
"epoch": 0.43810758263123784, |
|
"grad_norm": 0.17435774207115173, |
|
"kl": 0.0129547119140625, |
|
"learning_rate": 7.212633850113662e-07, |
|
"loss": -0.0002, |
|
"reward": 0.43373018503189087, |
|
"reward_std": 0.26510076597332954, |
|
"rewards/improved_len_reward_dast": 0.43373018503189087, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1309.869857788086, |
|
"epoch": 0.440699935191186, |
|
"grad_norm": 0.15860451757907867, |
|
"kl": 0.011932373046875, |
|
"learning_rate": 7.174779709801253e-07, |
|
"loss": -0.0072, |
|
"reward": 0.4780568554997444, |
|
"reward_std": 0.2705870047211647, |
|
"rewards/improved_len_reward_dast": 0.4780568554997444, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1558.0025329589844, |
|
"epoch": 0.44329228775113416, |
|
"grad_norm": 0.12908661365509033, |
|
"kl": 0.01216888427734375, |
|
"learning_rate": 7.136787498557344e-07, |
|
"loss": -0.0114, |
|
"reward": 0.47240160405635834, |
|
"reward_std": 0.30547885224223137, |
|
"rewards/improved_len_reward_dast": 0.47240160405635834, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1665.77294921875, |
|
"epoch": 0.4458846403110823, |
|
"grad_norm": 0.1525769829750061, |
|
"kl": 0.0130615234375, |
|
"learning_rate": 7.098660348507293e-07, |
|
"loss": -0.0124, |
|
"reward": 0.5375373288989067, |
|
"reward_std": 0.25483621656894684, |
|
"rewards/improved_len_reward_dast": 0.5375373288989067, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 1853.0943603515625, |
|
"epoch": 0.44847699287103043, |
|
"grad_norm": 0.13048604130744934, |
|
"kl": 0.0130157470703125, |
|
"learning_rate": 7.060401402900977e-07, |
|
"loss": -0.0133, |
|
"reward": 0.45648840069770813, |
|
"reward_std": 0.2915825918316841, |
|
"rewards/improved_len_reward_dast": 0.45648840069770813, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 1776.0254516601562, |
|
"epoch": 0.4510693454309786, |
|
"grad_norm": 0.14781691133975983, |
|
"kl": 0.0131683349609375, |
|
"learning_rate": 7.022013815853672e-07, |
|
"loss": -0.0126, |
|
"reward": 0.4387430027127266, |
|
"reward_std": 0.301468089222908, |
|
"rewards/improved_len_reward_dast": 0.4387430027127266, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1614.7882385253906, |
|
"epoch": 0.45366169799092676, |
|
"grad_norm": 0.1574937105178833, |
|
"kl": 0.0132598876953125, |
|
"learning_rate": 6.983500752086006e-07, |
|
"loss": 0.0277, |
|
"reward": 0.5207101553678513, |
|
"reward_std": 0.26092710718512535, |
|
"rewards/improved_len_reward_dast": 0.5207101553678513, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1572.3290405273438, |
|
"epoch": 0.4562540505508749, |
|
"grad_norm": 0.13418346643447876, |
|
"kl": 0.0114593505859375, |
|
"learning_rate": 6.94486538666307e-07, |
|
"loss": 0.017, |
|
"reward": 0.5103653743863106, |
|
"reward_std": 0.24962808936834335, |
|
"rewards/improved_len_reward_dast": 0.5103653743863106, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1609.9362182617188, |
|
"epoch": 0.4588464031108231, |
|
"grad_norm": 0.15368600189685822, |
|
"kl": 0.0113983154296875, |
|
"learning_rate": 6.906110904732656e-07, |
|
"loss": 0.0098, |
|
"reward": 0.571323998272419, |
|
"reward_std": 0.2758530154824257, |
|
"rewards/improved_len_reward_dast": 0.571323998272419, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 2069.033172607422, |
|
"epoch": 0.46143875567077125, |
|
"grad_norm": 0.1310436725616455, |
|
"kl": 0.012939453125, |
|
"learning_rate": 6.867240501262666e-07, |
|
"loss": 0.0214, |
|
"reward": 0.537315845489502, |
|
"reward_std": 0.21321317180991173, |
|
"rewards/improved_len_reward_dast": 0.537315845489502, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 1546.3826293945312, |
|
"epoch": 0.46403110823071936, |
|
"grad_norm": 0.18392066657543182, |
|
"kl": 0.011871337890625, |
|
"learning_rate": 6.828257380777723e-07, |
|
"loss": -0.0405, |
|
"reward": 0.337811965495348, |
|
"reward_std": 0.30249205976724625, |
|
"rewards/improved_len_reward_dast": 0.337811965495348, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1793.1861572265625, |
|
"epoch": 0.4666234607906675, |
|
"grad_norm": 0.13693319261074066, |
|
"kl": 0.0158233642578125, |
|
"learning_rate": 6.789164757094978e-07, |
|
"loss": 0.0131, |
|
"reward": 0.4841275066137314, |
|
"reward_std": 0.28166862949728966, |
|
"rewards/improved_len_reward_dast": 0.4841275066137314, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 1669.7703552246094, |
|
"epoch": 0.4692158133506157, |
|
"grad_norm": 0.26192930340766907, |
|
"kl": 0.012939453125, |
|
"learning_rate": 6.749965853059164e-07, |
|
"loss": 0.0681, |
|
"reward": 0.5609092861413956, |
|
"reward_std": 0.2805982828140259, |
|
"rewards/improved_len_reward_dast": 0.5609092861413956, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1579.7372131347656, |
|
"epoch": 0.47180816591056385, |
|
"grad_norm": 0.13756124675273895, |
|
"kl": 0.01165008544921875, |
|
"learning_rate": 6.710663900276903e-07, |
|
"loss": -0.0036, |
|
"reward": 0.4818296991288662, |
|
"reward_std": 0.23825621232390404, |
|
"rewards/improved_len_reward_dast": 0.4818296991288662, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1472.9540252685547, |
|
"epoch": 0.474400518470512, |
|
"grad_norm": 0.1440887451171875, |
|
"kl": 0.01090240478515625, |
|
"learning_rate": 6.671262138850274e-07, |
|
"loss": 0.0178, |
|
"reward": 0.6261176690459251, |
|
"reward_std": 0.2099764347076416, |
|
"rewards/improved_len_reward_dast": 0.6261176690459251, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1391.3188171386719, |
|
"epoch": 0.4769928710304601, |
|
"grad_norm": 0.14985869824886322, |
|
"kl": 0.010223388671875, |
|
"learning_rate": 6.631763817109717e-07, |
|
"loss": 0.036, |
|
"reward": 0.6541325002908707, |
|
"reward_std": 0.20582681149244308, |
|
"rewards/improved_len_reward_dast": 0.6541325002908707, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1469.6122436523438, |
|
"epoch": 0.4795852235904083, |
|
"grad_norm": 0.16654829680919647, |
|
"kl": 0.0106353759765625, |
|
"learning_rate": 6.592172191346218e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5705170333385468, |
|
"reward_std": 0.25396620109677315, |
|
"rewards/improved_len_reward_dast": 0.5705170333385468, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 1663.2117004394531, |
|
"epoch": 0.48217757615035645, |
|
"grad_norm": 0.1463920623064041, |
|
"kl": 0.0134735107421875, |
|
"learning_rate": 6.552490525542864e-07, |
|
"loss": 0.0037, |
|
"reward": 0.5099420920014381, |
|
"reward_std": 0.25713133439421654, |
|
"rewards/improved_len_reward_dast": 0.5099420920014381, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 1614.6351318359375, |
|
"epoch": 0.4847699287103046, |
|
"grad_norm": 0.13490332663059235, |
|
"kl": 0.012054443359375, |
|
"learning_rate": 6.512722091105757e-07, |
|
"loss": -0.0025, |
|
"reward": 0.5862669795751572, |
|
"reward_std": 0.257910817861557, |
|
"rewards/improved_len_reward_dast": 0.5862669795751572, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1392.2372131347656, |
|
"epoch": 0.4873622812702528, |
|
"grad_norm": 0.14285001158714294, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 6.472870166594314e-07, |
|
"loss": 0.0127, |
|
"reward": 0.6144573241472244, |
|
"reward_std": 0.2229880653321743, |
|
"rewards/improved_len_reward_dast": 0.6144573241472244, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1885.2652587890625, |
|
"epoch": 0.4899546338302009, |
|
"grad_norm": 0.14947953820228577, |
|
"kl": 0.0140228271484375, |
|
"learning_rate": 6.432938037450974e-07, |
|
"loss": 0.0111, |
|
"reward": 0.5071591883897781, |
|
"reward_std": 0.23271573707461357, |
|
"rewards/improved_len_reward_dast": 0.5071591883897781, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 1780.61474609375, |
|
"epoch": 0.49254698639014904, |
|
"grad_norm": 0.1921232044696808, |
|
"kl": 0.0138397216796875, |
|
"learning_rate": 6.392928995730352e-07, |
|
"loss": 0.0336, |
|
"reward": 0.5300878472626209, |
|
"reward_std": 0.2751046009361744, |
|
"rewards/improved_len_reward_dast": 0.5300878472626209, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1370.4795837402344, |
|
"epoch": 0.4951393389500972, |
|
"grad_norm": 0.2351725697517395, |
|
"kl": 0.01165008544921875, |
|
"learning_rate": 6.352846339827826e-07, |
|
"loss": 0.0776, |
|
"reward": 0.5745302811264992, |
|
"reward_std": 0.25346530973911285, |
|
"rewards/improved_len_reward_dast": 0.5745302811264992, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 1441.2831420898438, |
|
"epoch": 0.49773169151004537, |
|
"grad_norm": 0.16980963945388794, |
|
"kl": 0.013824462890625, |
|
"learning_rate": 6.312693374207627e-07, |
|
"loss": 0.0208, |
|
"reward": 0.548950806260109, |
|
"reward_std": 0.2456044964492321, |
|
"rewards/improved_len_reward_dast": 0.548950806260109, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1563.9183349609375, |
|
"epoch": 0.5003240440699935, |
|
"grad_norm": 0.16622929275035858, |
|
"kl": 0.0142364501953125, |
|
"learning_rate": 6.272473409130397e-07, |
|
"loss": 0.0204, |
|
"reward": 0.550769068300724, |
|
"reward_std": 0.2540467455983162, |
|
"rewards/improved_len_reward_dast": 0.550769068300724, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1447.869857788086, |
|
"epoch": 0.5029163966299417, |
|
"grad_norm": 0.15316490828990936, |
|
"kl": 0.010894775390625, |
|
"learning_rate": 6.232189760380301e-07, |
|
"loss": 0.0154, |
|
"reward": 0.5217381715774536, |
|
"reward_std": 0.28643129020929337, |
|
"rewards/improved_len_reward_dast": 0.5217381715774536, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1797.6096496582031, |
|
"epoch": 0.5055087491898899, |
|
"grad_norm": 0.17682208120822906, |
|
"kl": 0.0141143798828125, |
|
"learning_rate": 6.191845748991671e-07, |
|
"loss": -0.0155, |
|
"reward": 0.48948052898049355, |
|
"reward_std": 0.21832110546529293, |
|
"rewards/improved_len_reward_dast": 0.48948052898049355, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 1715.7448425292969, |
|
"epoch": 0.508101101749838, |
|
"grad_norm": 0.1485513597726822, |
|
"kl": 0.0159454345703125, |
|
"learning_rate": 6.151444700975203e-07, |
|
"loss": 0.0098, |
|
"reward": 0.599296048283577, |
|
"reward_std": 0.2545859329402447, |
|
"rewards/improved_len_reward_dast": 0.599296048283577, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 2129.3111877441406, |
|
"epoch": 0.5106934543097861, |
|
"grad_norm": 0.15262384712696075, |
|
"kl": 0.017852783203125, |
|
"learning_rate": 6.110989947043767e-07, |
|
"loss": 0.0272, |
|
"reward": 0.41421886533498764, |
|
"reward_std": 0.3264440894126892, |
|
"rewards/improved_len_reward_dast": 0.41421886533498764, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 1410.6402740478516, |
|
"epoch": 0.5132858068697342, |
|
"grad_norm": 0.15426945686340332, |
|
"kl": 0.015106201171875, |
|
"learning_rate": 6.070484822337816e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5891918540000916, |
|
"reward_std": 0.23263467848300934, |
|
"rewards/improved_len_reward_dast": 0.5891918540000916, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 1698.915771484375, |
|
"epoch": 0.5158781594296824, |
|
"grad_norm": 0.2302210032939911, |
|
"kl": 0.016326904296875, |
|
"learning_rate": 6.029932666150431e-07, |
|
"loss": 0.0565, |
|
"reward": 0.5624502748250961, |
|
"reward_std": 0.23056710511446, |
|
"rewards/improved_len_reward_dast": 0.5624502748250961, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1663.0356750488281, |
|
"epoch": 0.5184705119896306, |
|
"grad_norm": 0.15166351199150085, |
|
"kl": 0.016143798828125, |
|
"learning_rate": 5.989336821652029e-07, |
|
"loss": 0.0351, |
|
"reward": 0.5748374983668327, |
|
"reward_std": 0.231033306568861, |
|
"rewards/improved_len_reward_dast": 0.5748374983668327, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1331.0382385253906, |
|
"epoch": 0.5210628645495787, |
|
"grad_norm": 0.21695953607559204, |
|
"kl": 0.0140228271484375, |
|
"learning_rate": 5.948700635614745e-07, |
|
"loss": -0.0329, |
|
"reward": 0.35667416942305863, |
|
"reward_std": 0.29070717096328735, |
|
"rewards/improved_len_reward_dast": 0.35667416942305863, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1358.9055786132812, |
|
"epoch": 0.5236552171095269, |
|
"grad_norm": 0.18666747212409973, |
|
"kl": 0.0137786865234375, |
|
"learning_rate": 5.908027458136518e-07, |
|
"loss": 0.0408, |
|
"reward": 0.6661410629749298, |
|
"reward_std": 0.20661123096942902, |
|
"rewards/improved_len_reward_dast": 0.6661410629749298, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1476.1377563476562, |
|
"epoch": 0.5262475696694751, |
|
"grad_norm": 0.13691206276416779, |
|
"kl": 0.0115814208984375, |
|
"learning_rate": 5.867320642364916e-07, |
|
"loss": -0.0011, |
|
"reward": 0.6029430329799652, |
|
"reward_std": 0.2665823772549629, |
|
"rewards/improved_len_reward_dast": 0.6029430329799652, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1761.4412841796875, |
|
"epoch": 0.5288399222294232, |
|
"grad_norm": 0.16101093590259552, |
|
"kl": 0.0147247314453125, |
|
"learning_rate": 5.826583544220678e-07, |
|
"loss": -0.003, |
|
"reward": 0.4686589315533638, |
|
"reward_std": 0.30455850437283516, |
|
"rewards/improved_len_reward_dast": 0.4686589315533638, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1345.9336395263672, |
|
"epoch": 0.5314322747893714, |
|
"grad_norm": 0.18765667080879211, |
|
"kl": 0.0143280029296875, |
|
"learning_rate": 5.78581952212107e-07, |
|
"loss": 0.0427, |
|
"reward": 0.5415176302194595, |
|
"reward_std": 0.1892006602138281, |
|
"rewards/improved_len_reward_dast": 0.5415176302194595, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1723.2882385253906, |
|
"epoch": 0.5340246273493195, |
|
"grad_norm": 0.16698125004768372, |
|
"kl": 0.015594482421875, |
|
"learning_rate": 5.745031936702997e-07, |
|
"loss": 0.0169, |
|
"reward": 0.5265255123376846, |
|
"reward_std": 0.21307621523737907, |
|
"rewards/improved_len_reward_dast": 0.5265255123376846, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1802.2014770507812, |
|
"epoch": 0.5366169799092677, |
|
"grad_norm": 0.1614968627691269, |
|
"kl": 0.019561767578125, |
|
"learning_rate": 5.704224150545956e-07, |
|
"loss": 0.0221, |
|
"reward": 0.4998108521103859, |
|
"reward_std": 0.2493179477751255, |
|
"rewards/improved_len_reward_dast": 0.4998108521103859, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1295.1887664794922, |
|
"epoch": 0.5392093324692158, |
|
"grad_norm": 0.17712561786174774, |
|
"kl": 0.01239013671875, |
|
"learning_rate": 5.663399527894816e-07, |
|
"loss": 0.0241, |
|
"reward": 0.6530888006091118, |
|
"reward_std": 0.20925156585872173, |
|
"rewards/improved_len_reward_dast": 0.6530888006091118, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1494.2805786132812, |
|
"epoch": 0.5418016850291639, |
|
"grad_norm": 0.13745726644992828, |
|
"kl": 0.013885498046875, |
|
"learning_rate": 5.622561434382467e-07, |
|
"loss": 0.0127, |
|
"reward": 0.48835258930921555, |
|
"reward_std": 0.29268738254904747, |
|
"rewards/improved_len_reward_dast": 0.48835258930921555, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 2105.05859375, |
|
"epoch": 0.5443940375891121, |
|
"grad_norm": 0.16275277733802795, |
|
"kl": 0.019744873046875, |
|
"learning_rate": 5.581713236752361e-07, |
|
"loss": -0.0125, |
|
"reward": 0.49209489673376083, |
|
"reward_std": 0.22952783107757568, |
|
"rewards/improved_len_reward_dast": 0.49209489673376083, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1500.5382385253906, |
|
"epoch": 0.5469863901490603, |
|
"grad_norm": 0.1682814359664917, |
|
"kl": 0.015899658203125, |
|
"learning_rate": 5.540858302580934e-07, |
|
"loss": 0.0207, |
|
"reward": 0.5571364462375641, |
|
"reward_std": 0.24001171812415123, |
|
"rewards/improved_len_reward_dast": 0.5571364462375641, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1923.9718933105469, |
|
"epoch": 0.5495787427090084, |
|
"grad_norm": 0.26020315289497375, |
|
"kl": 0.0178680419921875, |
|
"learning_rate": 5.5e-07, |
|
"loss": -0.0687, |
|
"reward": 0.46721208840608597, |
|
"reward_std": 0.2817242816090584, |
|
"rewards/improved_len_reward_dast": 0.46721208840608597, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1559.1249694824219, |
|
"epoch": 0.5521710952689566, |
|
"grad_norm": 0.19159933924674988, |
|
"kl": 0.0157012939453125, |
|
"learning_rate": 5.459141697419066e-07, |
|
"loss": -0.0163, |
|
"reward": 0.6227085031569004, |
|
"reward_std": 0.17799550667405128, |
|
"rewards/improved_len_reward_dast": 0.6227085031569004, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1373.073959350586, |
|
"epoch": 0.5547634478289047, |
|
"grad_norm": 0.13028506934642792, |
|
"kl": 0.0119476318359375, |
|
"learning_rate": 5.418286763247641e-07, |
|
"loss": 0.0155, |
|
"reward": 0.6422765105962753, |
|
"reward_std": 0.20027055218815804, |
|
"rewards/improved_len_reward_dast": 0.6422765105962753, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1835.0841369628906, |
|
"epoch": 0.5573558003888529, |
|
"grad_norm": 0.19164609909057617, |
|
"kl": 0.02093505859375, |
|
"learning_rate": 5.377438565617532e-07, |
|
"loss": -0.0, |
|
"reward": 0.45971810445189476, |
|
"reward_std": 0.3102139085531235, |
|
"rewards/improved_len_reward_dast": 0.45971810445189476, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 2203.964324951172, |
|
"epoch": 0.5599481529488011, |
|
"grad_norm": 0.20495197176933289, |
|
"kl": 0.023651123046875, |
|
"learning_rate": 5.336600472105186e-07, |
|
"loss": 0.0257, |
|
"reward": 0.46121083945035934, |
|
"reward_std": 0.2739550843834877, |
|
"rewards/improved_len_reward_dast": 0.46121083945035934, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1694.767822265625, |
|
"epoch": 0.5625405055087492, |
|
"grad_norm": 0.17180804908275604, |
|
"kl": 0.0158233642578125, |
|
"learning_rate": 5.295775849454045e-07, |
|
"loss": 0.0214, |
|
"reward": 0.4882684126496315, |
|
"reward_std": 0.18393072485923767, |
|
"rewards/improved_len_reward_dast": 0.4882684126496315, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 2042.6683044433594, |
|
"epoch": 0.5651328580686974, |
|
"grad_norm": 0.14189192652702332, |
|
"kl": 0.0199127197265625, |
|
"learning_rate": 5.254968063297003e-07, |
|
"loss": 0.0033, |
|
"reward": 0.4571044594049454, |
|
"reward_std": 0.2295171208679676, |
|
"rewards/improved_len_reward_dast": 0.4571044594049454, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1910.2091064453125, |
|
"epoch": 0.5677252106286454, |
|
"grad_norm": 0.1559721827507019, |
|
"kl": 0.0175628662109375, |
|
"learning_rate": 5.214180477878931e-07, |
|
"loss": 0.0287, |
|
"reward": 0.5590897053480148, |
|
"reward_std": 0.24075813218951225, |
|
"rewards/improved_len_reward_dast": 0.5590897053480148, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 2258.568878173828, |
|
"epoch": 0.5703175631885936, |
|
"grad_norm": 0.14663958549499512, |
|
"kl": 0.0192718505859375, |
|
"learning_rate": 5.173416455779323e-07, |
|
"loss": 0.0057, |
|
"reward": 0.5018766671419144, |
|
"reward_std": 0.26441601663827896, |
|
"rewards/improved_len_reward_dast": 0.5018766671419144, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1572.7805786132812, |
|
"epoch": 0.5729099157485418, |
|
"grad_norm": 0.18465618789196014, |
|
"kl": 0.0140838623046875, |
|
"learning_rate": 5.132679357635086e-07, |
|
"loss": 0.0051, |
|
"reward": 0.503139078617096, |
|
"reward_std": 0.2186896838247776, |
|
"rewards/improved_len_reward_dast": 0.503139078617096, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 1802.676025390625, |
|
"epoch": 0.5755022683084899, |
|
"grad_norm": 0.14844530820846558, |
|
"kl": 0.0166778564453125, |
|
"learning_rate": 5.091972541863481e-07, |
|
"loss": 0.0324, |
|
"reward": 0.5386637449264526, |
|
"reward_std": 0.23858756944537163, |
|
"rewards/improved_len_reward_dast": 0.5386637449264526, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1240.0254821777344, |
|
"epoch": 0.5780946208684381, |
|
"grad_norm": 0.21698173880577087, |
|
"kl": 0.01102447509765625, |
|
"learning_rate": 5.051299364385257e-07, |
|
"loss": 0.0494, |
|
"reward": 0.6127595007419586, |
|
"reward_std": 0.22652167454361916, |
|
"rewards/improved_len_reward_dast": 0.6127595007419586, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 2182.4820861816406, |
|
"epoch": 0.5806869734283863, |
|
"grad_norm": 0.17258256673812866, |
|
"kl": 0.022186279296875, |
|
"learning_rate": 5.010663178347971e-07, |
|
"loss": 0.0347, |
|
"reward": 0.538253664970398, |
|
"reward_std": 0.2642097547650337, |
|
"rewards/improved_len_reward_dast": 0.538253664970398, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1501.5688171386719, |
|
"epoch": 0.5832793259883344, |
|
"grad_norm": 0.1956610530614853, |
|
"kl": 0.016021728515625, |
|
"learning_rate": 4.970067333849568e-07, |
|
"loss": 0.0345, |
|
"reward": 0.49669354408979416, |
|
"reward_std": 0.27108582854270935, |
|
"rewards/improved_len_reward_dast": 0.49669354408979416, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1730.1020202636719, |
|
"epoch": 0.5858716785482826, |
|
"grad_norm": 0.207245334982872, |
|
"kl": 0.019317626953125, |
|
"learning_rate": 4.929515177662182e-07, |
|
"loss": 0.0365, |
|
"reward": 0.5905517414212227, |
|
"reward_std": 0.19489995390176773, |
|
"rewards/improved_len_reward_dast": 0.5905517414212227, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1841.9719543457031, |
|
"epoch": 0.5884640311082308, |
|
"grad_norm": 0.15803495049476624, |
|
"kl": 0.0171661376953125, |
|
"learning_rate": 4.889010052956233e-07, |
|
"loss": 0.0201, |
|
"reward": 0.5235611572861671, |
|
"reward_std": 0.28989996388554573, |
|
"rewards/improved_len_reward_dast": 0.5235611572861671, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1340.9795837402344, |
|
"epoch": 0.5910563836681789, |
|
"grad_norm": 0.14726755023002625, |
|
"kl": 0.0139312744140625, |
|
"learning_rate": 4.848555299024798e-07, |
|
"loss": 0.0361, |
|
"reward": 0.6701846867799759, |
|
"reward_std": 0.1808851771056652, |
|
"rewards/improved_len_reward_dast": 0.6701846867799759, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1694.4412841796875, |
|
"epoch": 0.593648736228127, |
|
"grad_norm": 0.1517164260149002, |
|
"kl": 0.0196075439453125, |
|
"learning_rate": 4.80815425100833e-07, |
|
"loss": 0.0279, |
|
"reward": 0.5193780064582825, |
|
"reward_std": 0.2600921764969826, |
|
"rewards/improved_len_reward_dast": 0.5193780064582825, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1846.6199035644531, |
|
"epoch": 0.5962410887880751, |
|
"grad_norm": 0.14960968494415283, |
|
"kl": 0.021148681640625, |
|
"learning_rate": 4.7678102396196983e-07, |
|
"loss": 0.0065, |
|
"reward": 0.5484226644039154, |
|
"reward_std": 0.2323933281004429, |
|
"rewards/improved_len_reward_dast": 0.5484226644039154, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1674.3673400878906, |
|
"epoch": 0.5988334413480233, |
|
"grad_norm": 0.17465737462043762, |
|
"kl": 0.0171966552734375, |
|
"learning_rate": 4.727526590869605e-07, |
|
"loss": 0.0101, |
|
"reward": 0.4662090986967087, |
|
"reward_std": 0.21839703619480133, |
|
"rewards/improved_len_reward_dast": 0.4662090986967087, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1509.1658020019531, |
|
"epoch": 0.6014257939079715, |
|
"grad_norm": 0.2101481705904007, |
|
"kl": 0.0194244384765625, |
|
"learning_rate": 4.6873066257923735e-07, |
|
"loss": 0.0111, |
|
"reward": 0.41205941140651703, |
|
"reward_std": 0.21029997244477272, |
|
"rewards/improved_len_reward_dast": 0.41205941140651703, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 2161.2933044433594, |
|
"epoch": 0.6040181464679196, |
|
"grad_norm": 0.15703819692134857, |
|
"kl": 0.02520751953125, |
|
"learning_rate": 4.647153660172173e-07, |
|
"loss": 0.0004, |
|
"reward": 0.48261498659849167, |
|
"reward_std": 0.23572781309485435, |
|
"rewards/improved_len_reward_dast": 0.48261498659849167, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 2022.1300354003906, |
|
"epoch": 0.6066104990278678, |
|
"grad_norm": 0.26027029752731323, |
|
"kl": 0.027191162109375, |
|
"learning_rate": 4.607071004269647e-07, |
|
"loss": 0.0521, |
|
"reward": 0.5801032036542892, |
|
"reward_std": 0.2363697662949562, |
|
"rewards/improved_len_reward_dast": 0.5801032036542892, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1732.1275329589844, |
|
"epoch": 0.609202851587816, |
|
"grad_norm": 0.16403397917747498, |
|
"kl": 0.023712158203125, |
|
"learning_rate": 4.567061962549025e-07, |
|
"loss": 0.0162, |
|
"reward": 0.5604917779564857, |
|
"reward_std": 0.22574709728360176, |
|
"rewards/improved_len_reward_dast": 0.5604917779564857, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1861.7398071289062, |
|
"epoch": 0.6117952041477641, |
|
"grad_norm": 0.17224401235580444, |
|
"kl": 0.02764892578125, |
|
"learning_rate": 4.527129833405687e-07, |
|
"loss": -0.0105, |
|
"reward": 0.5203966200351715, |
|
"reward_std": 0.24152075126767159, |
|
"rewards/improved_len_reward_dast": 0.5203966200351715, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1547.3724365234375, |
|
"epoch": 0.6143875567077123, |
|
"grad_norm": 0.23777875304222107, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 4.4872779088942425e-07, |
|
"loss": 0.0271, |
|
"reward": 0.5496758297085762, |
|
"reward_std": 0.27215462550520897, |
|
"rewards/improved_len_reward_dast": 0.5496758297085762, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1969.2167663574219, |
|
"epoch": 0.6169799092676604, |
|
"grad_norm": 0.17101754248142242, |
|
"kl": 0.028717041015625, |
|
"learning_rate": 4.447509474457135e-07, |
|
"loss": 0.0279, |
|
"reward": 0.614069253206253, |
|
"reward_std": 0.22787783294916153, |
|
"rewards/improved_len_reward_dast": 0.614069253206253, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1983.8570861816406, |
|
"epoch": 0.6195722618276086, |
|
"grad_norm": 0.14745572209358215, |
|
"kl": 0.027984619140625, |
|
"learning_rate": 4.4078278086537823e-07, |
|
"loss": 0.0075, |
|
"reward": 0.5620970204472542, |
|
"reward_std": 0.28037280961871147, |
|
"rewards/improved_len_reward_dast": 0.5620970204472542, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1501.4897766113281, |
|
"epoch": 0.6221646143875567, |
|
"grad_norm": 0.15315327048301697, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 4.3682361828902846e-07, |
|
"loss": 0.0168, |
|
"reward": 0.5364163219928741, |
|
"reward_std": 0.2769155353307724, |
|
"rewards/improved_len_reward_dast": 0.5364163219928741, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1720.4004516601562, |
|
"epoch": 0.6247569669475048, |
|
"grad_norm": 0.2516409158706665, |
|
"kl": 0.025726318359375, |
|
"learning_rate": 4.328737861149726e-07, |
|
"loss": -0.0237, |
|
"reward": 0.4434478208422661, |
|
"reward_std": 0.27931295707821846, |
|
"rewards/improved_len_reward_dast": 0.4434478208422661, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1809.1555633544922, |
|
"epoch": 0.627349319507453, |
|
"grad_norm": 0.16826176643371582, |
|
"kl": 0.0258331298828125, |
|
"learning_rate": 4.289336099723098e-07, |
|
"loss": 0.0143, |
|
"reward": 0.5690242052078247, |
|
"reward_std": 0.23354141414165497, |
|
"rewards/improved_len_reward_dast": 0.5690242052078247, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1983.2984313964844, |
|
"epoch": 0.6299416720674011, |
|
"grad_norm": 0.19760891795158386, |
|
"kl": 0.0326995849609375, |
|
"learning_rate": 4.250034146940834e-07, |
|
"loss": 0.0445, |
|
"reward": 0.5676752850413322, |
|
"reward_std": 0.24079378694295883, |
|
"rewards/improved_len_reward_dast": 0.5676752850413322, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1628.5739440917969, |
|
"epoch": 0.6325340246273493, |
|
"grad_norm": 0.16681502759456635, |
|
"kl": 0.0252532958984375, |
|
"learning_rate": 4.210835242905023e-07, |
|
"loss": 0.023, |
|
"reward": 0.5827814638614655, |
|
"reward_std": 0.23568623140454292, |
|
"rewards/improved_len_reward_dast": 0.5827814638614655, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1828.3596496582031, |
|
"epoch": 0.6351263771872975, |
|
"grad_norm": 0.18224835395812988, |
|
"kl": 0.032196044921875, |
|
"learning_rate": 4.1717426192222784e-07, |
|
"loss": 0.0288, |
|
"reward": 0.5939928889274597, |
|
"reward_std": 0.19402909092605114, |
|
"rewards/improved_len_reward_dast": 0.5939928889274597, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 2015.790771484375, |
|
"epoch": 0.6377187297472456, |
|
"grad_norm": 0.17992916703224182, |
|
"kl": 0.03424072265625, |
|
"learning_rate": 4.1327594987373347e-07, |
|
"loss": -0.0046, |
|
"reward": 0.4280674587935209, |
|
"reward_std": 0.23897960036993027, |
|
"rewards/improved_len_reward_dast": 0.4280674587935209, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 2058.4132385253906, |
|
"epoch": 0.6403110823071938, |
|
"grad_norm": 0.16108137369155884, |
|
"kl": 0.032012939453125, |
|
"learning_rate": 4.0938890952673443e-07, |
|
"loss": 0.0148, |
|
"reward": 0.4607112519443035, |
|
"reward_std": 0.19015729054808617, |
|
"rewards/improved_len_reward_dast": 0.4607112519443035, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1944.2117004394531, |
|
"epoch": 0.642903434867142, |
|
"grad_norm": 0.17802801728248596, |
|
"kl": 0.03271484375, |
|
"learning_rate": 4.05513461333693e-07, |
|
"loss": -0.017, |
|
"reward": 0.47688183188438416, |
|
"reward_std": 0.2750718258321285, |
|
"rewards/improved_len_reward_dast": 0.47688183188438416, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1828.591796875, |
|
"epoch": 0.6454957874270901, |
|
"grad_norm": 0.2043733447790146, |
|
"kl": 0.0340576171875, |
|
"learning_rate": 4.016499247913994e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5244659259915352, |
|
"reward_std": 0.22990256920456886, |
|
"rewards/improved_len_reward_dast": 0.5244659259915352, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1883.8673095703125, |
|
"epoch": 0.6480881399870383, |
|
"grad_norm": 0.1844264417886734, |
|
"kl": 0.032318115234375, |
|
"learning_rate": 3.977986184146328e-07, |
|
"loss": 0.0399, |
|
"reward": 0.5821568816900253, |
|
"reward_std": 0.23736536875367165, |
|
"rewards/improved_len_reward_dast": 0.5821568816900253, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 2136.216796875, |
|
"epoch": 0.6506804925469863, |
|
"grad_norm": 0.23143452405929565, |
|
"kl": 0.03753662109375, |
|
"learning_rate": 3.939598597099022e-07, |
|
"loss": -0.0036, |
|
"reward": 0.4462605491280556, |
|
"reward_std": 0.31835515797138214, |
|
"rewards/improved_len_reward_dast": 0.4462605491280556, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1681.5816040039062, |
|
"epoch": 0.6532728451069345, |
|
"grad_norm": 0.21353423595428467, |
|
"kl": 0.030914306640625, |
|
"learning_rate": 3.9013396514927076e-07, |
|
"loss": -0.0203, |
|
"reward": 0.47858475893735886, |
|
"reward_std": 0.27276671305298805, |
|
"rewards/improved_len_reward_dast": 0.47858475893735886, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 1958.6862182617188, |
|
"epoch": 0.6558651976668827, |
|
"grad_norm": 0.13848742842674255, |
|
"kl": 0.02813720703125, |
|
"learning_rate": 3.8632125014426566e-07, |
|
"loss": -0.0093, |
|
"reward": 0.4058891125023365, |
|
"reward_std": 0.2632647715508938, |
|
"rewards/improved_len_reward_dast": 0.4058891125023365, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 2378.114715576172, |
|
"epoch": 0.6584575502268308, |
|
"grad_norm": 0.2316746562719345, |
|
"kl": 0.04150390625, |
|
"learning_rate": 3.8252202901987474e-07, |
|
"loss": 0.0253, |
|
"reward": 0.4882591739296913, |
|
"reward_std": 0.23233528062701225, |
|
"rewards/improved_len_reward_dast": 0.4882591739296913, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1957.6759643554688, |
|
"epoch": 0.661049902786779, |
|
"grad_norm": 0.25321757793426514, |
|
"kl": 0.036376953125, |
|
"learning_rate": 3.7873661498863384e-07, |
|
"loss": -0.0147, |
|
"reward": 0.5560312643647194, |
|
"reward_std": 0.25412745028734207, |
|
"rewards/improved_len_reward_dast": 0.5560312643647194, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1821.5433349609375, |
|
"epoch": 0.6636422553467272, |
|
"grad_norm": 0.18754082918167114, |
|
"kl": 0.033843994140625, |
|
"learning_rate": 3.7496532012480463e-07, |
|
"loss": 0.0354, |
|
"reward": 0.5669542029500008, |
|
"reward_std": 0.21794036030769348, |
|
"rewards/improved_len_reward_dast": 0.5669542029500008, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1623.8086547851562, |
|
"epoch": 0.6662346079066753, |
|
"grad_norm": 0.23035286366939545, |
|
"kl": 0.032135009765625, |
|
"learning_rate": 3.7120845533864706e-07, |
|
"loss": 0.0102, |
|
"reward": 0.5488623678684235, |
|
"reward_std": 0.25401103869080544, |
|
"rewards/improved_len_reward_dast": 0.5488623678684235, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 2070.8800659179688, |
|
"epoch": 0.6688269604666235, |
|
"grad_norm": 0.22188010811805725, |
|
"kl": 0.039306640625, |
|
"learning_rate": 3.6746633035078723e-07, |
|
"loss": -0.0121, |
|
"reward": 0.417112834751606, |
|
"reward_std": 0.21221662312746048, |
|
"rewards/improved_len_reward_dast": 0.417112834751606, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1561.369873046875, |
|
"epoch": 0.6714193130265717, |
|
"grad_norm": 0.17679612338542938, |
|
"kl": 0.029083251953125, |
|
"learning_rate": 3.63739253666684e-07, |
|
"loss": 0.0025, |
|
"reward": 0.5591792389750481, |
|
"reward_std": 0.23877732083201408, |
|
"rewards/improved_len_reward_dast": 0.5591792389750481, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 2174.785675048828, |
|
"epoch": 0.6740116655865198, |
|
"grad_norm": 0.27692729234695435, |
|
"kl": 0.037506103515625, |
|
"learning_rate": 3.6002753255119533e-07, |
|
"loss": 0.0435, |
|
"reward": 0.5408740267157555, |
|
"reward_std": 0.3040400817990303, |
|
"rewards/improved_len_reward_dast": 0.5408740267157555, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1679.0331115722656, |
|
"epoch": 0.6766040181464679, |
|
"grad_norm": 0.2655293047428131, |
|
"kl": 0.0357513427734375, |
|
"learning_rate": 3.5633147300324706e-07, |
|
"loss": 0.0546, |
|
"reward": 0.5232817307114601, |
|
"reward_std": 0.19237679801881313, |
|
"rewards/improved_len_reward_dast": 0.5232817307114601, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 2010.9719543457031, |
|
"epoch": 0.679196370706416, |
|
"grad_norm": 0.16808085143566132, |
|
"kl": 0.038818359375, |
|
"learning_rate": 3.526513797306051e-07, |
|
"loss": 0.0302, |
|
"reward": 0.5936213284730911, |
|
"reward_std": 0.19600553810596466, |
|
"rewards/improved_len_reward_dast": 0.5936213284730911, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 2130.2651977539062, |
|
"epoch": 0.6817887232663642, |
|
"grad_norm": 0.2609139382839203, |
|
"kl": 0.05194091796875, |
|
"learning_rate": 3.489875561247568e-07, |
|
"loss": 0.0577, |
|
"reward": 0.5483756810426712, |
|
"reward_std": 0.19722291082143784, |
|
"rewards/improved_len_reward_dast": 0.5483756810426712, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1698.2474365234375, |
|
"epoch": 0.6843810758263124, |
|
"grad_norm": 0.20713907480239868, |
|
"kl": 0.035247802734375, |
|
"learning_rate": 3.453403042358968e-07, |
|
"loss": 0.0219, |
|
"reward": 0.591205969452858, |
|
"reward_std": 0.2349410019814968, |
|
"rewards/improved_len_reward_dast": 0.591205969452858, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 1852.9157409667969, |
|
"epoch": 0.6869734283862605, |
|
"grad_norm": 0.1937071830034256, |
|
"kl": 0.0416259765625, |
|
"learning_rate": 3.417099247480277e-07, |
|
"loss": 0.0277, |
|
"reward": 0.5075777247548103, |
|
"reward_std": 0.2149694226682186, |
|
"rewards/improved_len_reward_dast": 0.5075777247548103, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 2136.966766357422, |
|
"epoch": 0.6895657809462087, |
|
"grad_norm": 0.2680908441543579, |
|
"kl": 0.0557861328125, |
|
"learning_rate": 3.3809671695416916e-07, |
|
"loss": 0.0296, |
|
"reward": 0.5373198315501213, |
|
"reward_std": 0.2887946330010891, |
|
"rewards/improved_len_reward_dast": 0.5373198315501213, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1520.6912841796875, |
|
"epoch": 0.6921581335061568, |
|
"grad_norm": 0.21836334466934204, |
|
"kl": 0.03875732421875, |
|
"learning_rate": 3.345009787316859e-07, |
|
"loss": 0.0328, |
|
"reward": 0.6441917270421982, |
|
"reward_std": 0.20870398730039597, |
|
"rewards/improved_len_reward_dast": 0.6441917270421982, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1714.9182891845703, |
|
"epoch": 0.694750486066105, |
|
"grad_norm": 0.2412445843219757, |
|
"kl": 0.045013427734375, |
|
"learning_rate": 3.309230065177289e-07, |
|
"loss": 0.0413, |
|
"reward": 0.6430572420358658, |
|
"reward_std": 0.1806990448385477, |
|
"rewards/improved_len_reward_dast": 0.6430572420358658, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1760.8800354003906, |
|
"epoch": 0.6973428386260532, |
|
"grad_norm": 0.22807085514068604, |
|
"kl": 0.05609130859375, |
|
"learning_rate": 3.273630952847971e-07, |
|
"loss": 0.0423, |
|
"reward": 0.5424732938408852, |
|
"reward_std": 0.2143792137503624, |
|
"rewards/improved_len_reward_dast": 0.5424732938408852, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1853.6504821777344, |
|
"epoch": 0.6999351911860013, |
|
"grad_norm": 0.181836798787117, |
|
"kl": 0.0499267578125, |
|
"learning_rate": 3.2382153851641996e-07, |
|
"loss": 0.0181, |
|
"reward": 0.444052018225193, |
|
"reward_std": 0.18882366083562374, |
|
"rewards/improved_len_reward_dast": 0.444052018225193, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1881.66064453125, |
|
"epoch": 0.7025275437459495, |
|
"grad_norm": 0.27354660630226135, |
|
"kl": 0.068359375, |
|
"learning_rate": 3.202986281829616e-07, |
|
"loss": 0.0349, |
|
"reward": 0.517607145011425, |
|
"reward_std": 0.27927474305033684, |
|
"rewards/improved_len_reward_dast": 0.517607145011425, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1876.6555480957031, |
|
"epoch": 0.7051198963058976, |
|
"grad_norm": 0.28521379828453064, |
|
"kl": 0.05859375, |
|
"learning_rate": 3.1679465471755106e-07, |
|
"loss": 0.0206, |
|
"reward": 0.4820089340209961, |
|
"reward_std": 0.23564638569951057, |
|
"rewards/improved_len_reward_dast": 0.4820089340209961, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 2040.4591979980469, |
|
"epoch": 0.7077122488658457, |
|
"grad_norm": 0.23997171223163605, |
|
"kl": 0.08282470703125, |
|
"learning_rate": 3.1330990699213824e-07, |
|
"loss": 0.0225, |
|
"reward": 0.5280723571777344, |
|
"reward_std": 0.19220414385199547, |
|
"rewards/improved_len_reward_dast": 0.5280723571777344, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1857.6989440917969, |
|
"epoch": 0.7103046014257939, |
|
"grad_norm": 0.43727678060531616, |
|
"kl": 0.06512451171875, |
|
"learning_rate": 3.0984467229367885e-07, |
|
"loss": -0.0214, |
|
"reward": 0.4374289773404598, |
|
"reward_std": 0.24020638316869736, |
|
"rewards/improved_len_reward_dast": 0.4374289773404598, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1751.9540100097656, |
|
"epoch": 0.712896953985742, |
|
"grad_norm": 0.3334012031555176, |
|
"kl": 0.06158447265625, |
|
"learning_rate": 3.063992363004503e-07, |
|
"loss": 0.045, |
|
"reward": 0.560393676161766, |
|
"reward_std": 0.24407575279474258, |
|
"rewards/improved_len_reward_dast": 0.560393676161766, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1921.1402282714844, |
|
"epoch": 0.7154893065456902, |
|
"grad_norm": 0.3832853436470032, |
|
"kl": 0.07147216796875, |
|
"learning_rate": 3.0297388305850004e-07, |
|
"loss": 0.0472, |
|
"reward": 0.4454130306839943, |
|
"reward_std": 0.2820102423429489, |
|
"rewards/improved_len_reward_dast": 0.4454130306839943, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1837.2091369628906, |
|
"epoch": 0.7180816591056384, |
|
"grad_norm": 0.3518737256526947, |
|
"kl": 0.0821533203125, |
|
"learning_rate": 2.9956889495822877e-07, |
|
"loss": 0.012, |
|
"reward": 0.4661199301481247, |
|
"reward_std": 0.27521887794137, |
|
"rewards/improved_len_reward_dast": 0.4661199301481247, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1900.5535583496094, |
|
"epoch": 0.7206740116655865, |
|
"grad_norm": 0.43427199125289917, |
|
"kl": 0.0863037109375, |
|
"learning_rate": 2.961845527111091e-07, |
|
"loss": 0.023, |
|
"reward": 0.49635138362646103, |
|
"reward_std": 0.19695542380213737, |
|
"rewards/improved_len_reward_dast": 0.49635138362646103, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1781.9693603515625, |
|
"epoch": 0.7232663642255347, |
|
"grad_norm": 0.4264023005962372, |
|
"kl": 0.0863037109375, |
|
"learning_rate": 2.9282113532654363e-07, |
|
"loss": 0.0375, |
|
"reward": 0.5367333218455315, |
|
"reward_std": 0.2848246172070503, |
|
"rewards/improved_len_reward_dast": 0.5367333218455315, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1870.1810607910156, |
|
"epoch": 0.7258587167854829, |
|
"grad_norm": 0.4594426155090332, |
|
"kl": 0.093994140625, |
|
"learning_rate": 2.894789200888634e-07, |
|
"loss": 0.0754, |
|
"reward": 0.5816426277160645, |
|
"reward_std": 0.22311532869935036, |
|
"rewards/improved_len_reward_dast": 0.5816426277160645, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1424.9234619140625, |
|
"epoch": 0.728451069345431, |
|
"grad_norm": 0.29910653829574585, |
|
"kl": 0.07781982421875, |
|
"learning_rate": 2.8615818253446766e-07, |
|
"loss": 0.0199, |
|
"reward": 0.6617397367954254, |
|
"reward_std": 0.2196519523859024, |
|
"rewards/improved_len_reward_dast": 0.6617397367954254, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1413.9310913085938, |
|
"epoch": 0.7310434219053791, |
|
"grad_norm": 0.3942880928516388, |
|
"kl": 0.0758056640625, |
|
"learning_rate": 2.828591964291093e-07, |
|
"loss": 0.0087, |
|
"reward": 0.5287511795759201, |
|
"reward_std": 0.24336805939674377, |
|
"rewards/improved_len_reward_dast": 0.5287511795759201, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1595.8367309570312, |
|
"epoch": 0.7336357744653272, |
|
"grad_norm": 0.3628610074520111, |
|
"kl": 0.07489013671875, |
|
"learning_rate": 2.7958223374532363e-07, |
|
"loss": -0.0144, |
|
"reward": 0.492396779358387, |
|
"reward_std": 0.28066498413681984, |
|
"rewards/improved_len_reward_dast": 0.492396779358387, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1388.1887512207031, |
|
"epoch": 0.7362281270252754, |
|
"grad_norm": 0.463160902261734, |
|
"kl": 0.06719970703125, |
|
"learning_rate": 2.7632756464000835e-07, |
|
"loss": 0.0589, |
|
"reward": 0.6620587855577469, |
|
"reward_std": 0.20764853432774544, |
|
"rewards/improved_len_reward_dast": 0.6620587855577469, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 2089.030517578125, |
|
"epoch": 0.7388204795852236, |
|
"grad_norm": 0.416213721036911, |
|
"kl": 0.13720703125, |
|
"learning_rate": 2.730954574321503e-07, |
|
"loss": 0.0295, |
|
"reward": 0.35152027755975723, |
|
"reward_std": 0.24749820679426193, |
|
"rewards/improved_len_reward_dast": 0.35152027755975723, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1603.8214111328125, |
|
"epoch": 0.7414128321451717, |
|
"grad_norm": 0.2952825725078583, |
|
"kl": 0.1046142578125, |
|
"learning_rate": 2.698861785807055e-07, |
|
"loss": 0.0192, |
|
"reward": 0.5497759729623795, |
|
"reward_std": 0.30123934894800186, |
|
"rewards/improved_len_reward_dast": 0.5497759729623795, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1780.3035583496094, |
|
"epoch": 0.7440051847051199, |
|
"grad_norm": 0.49455901980400085, |
|
"kl": 0.1126708984375, |
|
"learning_rate": 2.6669999266263154e-07, |
|
"loss": -0.0016, |
|
"reward": 0.45189622789621353, |
|
"reward_std": 0.2775086760520935, |
|
"rewards/improved_len_reward_dast": 0.45189622789621353, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 1889.9999389648438, |
|
"epoch": 0.7465975372650681, |
|
"grad_norm": 0.3843936324119568, |
|
"kl": 0.11334228515625, |
|
"learning_rate": 2.635371623510758e-07, |
|
"loss": 0.0216, |
|
"reward": 0.4534267857670784, |
|
"reward_std": 0.24446595832705498, |
|
"rewards/improved_len_reward_dast": 0.4534267857670784, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1606.8596649169922, |
|
"epoch": 0.7491898898250162, |
|
"grad_norm": 0.3675477206707001, |
|
"kl": 0.1318359375, |
|
"learning_rate": 2.6039794839372066e-07, |
|
"loss": 0.0298, |
|
"reward": 0.5441700667142868, |
|
"reward_std": 0.23767928034067154, |
|
"rewards/improved_len_reward_dast": 0.5441700667142868, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1844.4744567871094, |
|
"epoch": 0.7517822423849644, |
|
"grad_norm": 0.3915606141090393, |
|
"kl": 0.1278076171875, |
|
"learning_rate": 2.5728260959128614e-07, |
|
"loss": 0.01, |
|
"reward": 0.44573093950748444, |
|
"reward_std": 0.2749871090054512, |
|
"rewards/improved_len_reward_dast": 0.44573093950748444, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 2282.3596801757812, |
|
"epoch": 0.7543745949449125, |
|
"grad_norm": 0.4575667977333069, |
|
"kl": 0.18359375, |
|
"learning_rate": 2.541914027761951e-07, |
|
"loss": 0.0404, |
|
"reward": 0.4888821840286255, |
|
"reward_std": 0.25494180247187614, |
|
"rewards/improved_len_reward_dast": 0.4888821840286255, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1613.5662994384766, |
|
"epoch": 0.7569669475048607, |
|
"grad_norm": 0.30206117033958435, |
|
"kl": 0.128662109375, |
|
"learning_rate": 2.511245827913991e-07, |
|
"loss": 0.0256, |
|
"reward": 0.49646422639489174, |
|
"reward_std": 0.22853870689868927, |
|
"rewards/improved_len_reward_dast": 0.49646422639489174, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1986.2856750488281, |
|
"epoch": 0.7595593000648088, |
|
"grad_norm": 0.5634958148002625, |
|
"kl": 0.15277099609375, |
|
"learning_rate": 2.4808240246936866e-07, |
|
"loss": -0.0016, |
|
"reward": 0.422063373029232, |
|
"reward_std": 0.27426348254084587, |
|
"rewards/improved_len_reward_dast": 0.422063373029232, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1912.8239440917969, |
|
"epoch": 0.7621516526247569, |
|
"grad_norm": 0.649374783039093, |
|
"kl": 0.14990234375, |
|
"learning_rate": 2.450651126112504e-07, |
|
"loss": 0.0626, |
|
"reward": 0.5210420861840248, |
|
"reward_std": 0.1957332007586956, |
|
"rewards/improved_len_reward_dast": 0.5210420861840248, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 1617.318832397461, |
|
"epoch": 0.7647440051847051, |
|
"grad_norm": 0.4732544720172882, |
|
"kl": 0.13031005859375, |
|
"learning_rate": 2.4207296196618924e-07, |
|
"loss": 0.0492, |
|
"reward": 0.5095237344503403, |
|
"reward_std": 0.18864410370588303, |
|
"rewards/improved_len_reward_dast": 0.5095237344503403, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 1087.5356903076172, |
|
"epoch": 0.7673363577446533, |
|
"grad_norm": 0.2905370593070984, |
|
"kl": 0.0693359375, |
|
"learning_rate": 2.3910619721082253e-07, |
|
"loss": 0.0114, |
|
"reward": 0.6024390161037445, |
|
"reward_std": 0.22132978588342667, |
|
"rewards/improved_len_reward_dast": 0.6024390161037445, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1602.8443603515625, |
|
"epoch": 0.7699287103046014, |
|
"grad_norm": 0.3845907151699066, |
|
"kl": 0.1500244140625, |
|
"learning_rate": 2.3616506292894282e-07, |
|
"loss": 0.0355, |
|
"reward": 0.5333931297063828, |
|
"reward_std": 0.3087846711277962, |
|
"rewards/improved_len_reward_dast": 0.5333931297063828, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1856.6402587890625, |
|
"epoch": 0.7725210628645496, |
|
"grad_norm": 0.5214760899543762, |
|
"kl": 0.2025146484375, |
|
"learning_rate": 2.332498015913344e-07, |
|
"loss": 0.0709, |
|
"reward": 0.5482401996850967, |
|
"reward_std": 0.21799317747354507, |
|
"rewards/improved_len_reward_dast": 0.5482401996850967, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1488.5025329589844, |
|
"epoch": 0.7751134154244977, |
|
"grad_norm": 0.43935874104499817, |
|
"kl": 0.186279296875, |
|
"learning_rate": 2.303606535357843e-07, |
|
"loss": 0.0366, |
|
"reward": 0.5733404159545898, |
|
"reward_std": 0.21773215383291245, |
|
"rewards/improved_len_reward_dast": 0.5733404159545898, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1722.0892486572266, |
|
"epoch": 0.7777057679844459, |
|
"grad_norm": 0.6731678247451782, |
|
"kl": 0.249267578125, |
|
"learning_rate": 2.2749785694726685e-07, |
|
"loss": 0.0367, |
|
"reward": 0.42770911008119583, |
|
"reward_std": 0.2604576535522938, |
|
"rewards/improved_len_reward_dast": 0.42770911008119583, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1631.4693603515625, |
|
"epoch": 0.7802981205443941, |
|
"grad_norm": 0.5669419765472412, |
|
"kl": 0.2646484375, |
|
"learning_rate": 2.2466164783830972e-07, |
|
"loss": 0.0641, |
|
"reward": 0.4630614146590233, |
|
"reward_std": 0.23011888936161995, |
|
"rewards/improved_len_reward_dast": 0.4630614146590233, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 2058.1912536621094, |
|
"epoch": 0.7828904731043422, |
|
"grad_norm": 1.057323694229126, |
|
"kl": 0.357421875, |
|
"learning_rate": 2.2185226002953483e-07, |
|
"loss": 0.0775, |
|
"reward": 0.37563329190015793, |
|
"reward_std": 0.2304530180990696, |
|
"rewards/improved_len_reward_dast": 0.37563329190015793, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 2198.5254516601562, |
|
"epoch": 0.7854828256642904, |
|
"grad_norm": 0.8592402338981628, |
|
"kl": 0.4775390625, |
|
"learning_rate": 2.1906992513038268e-07, |
|
"loss": 0.074, |
|
"reward": 0.27012719213962555, |
|
"reward_std": 0.28526439890265465, |
|
"rewards/improved_len_reward_dast": 0.27012719213962555, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1901.4846496582031, |
|
"epoch": 0.7880751782242384, |
|
"grad_norm": 0.8431882858276367, |
|
"kl": 0.42138671875, |
|
"learning_rate": 2.1631487252001822e-07, |
|
"loss": 0.0492, |
|
"reward": 0.37709545344114304, |
|
"reward_std": 0.25447146967053413, |
|
"rewards/improved_len_reward_dast": 0.37709545344114304, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 2066.573974609375, |
|
"epoch": 0.7906675307841866, |
|
"grad_norm": 1.21139657497406, |
|
"kl": 0.4541015625, |
|
"learning_rate": 2.1358732932842032e-07, |
|
"loss": 0.0979, |
|
"reward": 0.31043257750570774, |
|
"reward_std": 0.24911593459546566, |
|
"rewards/improved_len_reward_dast": 0.31043257750570774, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 2245.244903564453, |
|
"epoch": 0.7932598833441348, |
|
"grad_norm": 1.6808669567108154, |
|
"kl": 0.705078125, |
|
"learning_rate": 2.1088752041765734e-07, |
|
"loss": 0.0566, |
|
"reward": 0.26184154860675335, |
|
"reward_std": 0.23590726777911186, |
|
"rewards/improved_len_reward_dast": 0.26184154860675335, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1971.1198425292969, |
|
"epoch": 0.7958522359040829, |
|
"grad_norm": 1.2574543952941895, |
|
"kl": 0.59619140625, |
|
"learning_rate": 2.0821566836334847e-07, |
|
"loss": 0.0527, |
|
"reward": 0.32783937454223633, |
|
"reward_std": 0.24020928516983986, |
|
"rewards/improved_len_reward_dast": 0.32783937454223633, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1597.5178527832031, |
|
"epoch": 0.7984445884640311, |
|
"grad_norm": 1.4473228454589844, |
|
"kl": 0.477294921875, |
|
"learning_rate": 2.0557199343631494e-07, |
|
"loss": 0.1143, |
|
"reward": 0.35970793664455414, |
|
"reward_std": 0.24855320155620575, |
|
"rewards/improved_len_reward_dast": 0.35970793664455414, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 2033.6224365234375, |
|
"epoch": 0.8010369410239793, |
|
"grad_norm": 1.6653419733047485, |
|
"kl": 0.60107421875, |
|
"learning_rate": 2.0295671358442033e-07, |
|
"loss": 0.0711, |
|
"reward": 0.32821540907025337, |
|
"reward_std": 0.2933182083070278, |
|
"rewards/improved_len_reward_dast": 0.32821540907025337, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1561.040771484375, |
|
"epoch": 0.8036292935839274, |
|
"grad_norm": 1.075103759765625, |
|
"kl": 0.4736328125, |
|
"learning_rate": 2.0037004441460263e-07, |
|
"loss": 0.0643, |
|
"reward": 0.29576242342591286, |
|
"reward_std": 0.282806184142828, |
|
"rewards/improved_len_reward_dast": 0.29576242342591286, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1718.183609008789, |
|
"epoch": 0.8062216461438756, |
|
"grad_norm": 1.504372477531433, |
|
"kl": 0.502197265625, |
|
"learning_rate": 1.9781219917509987e-07, |
|
"loss": 0.027, |
|
"reward": 0.3890268914401531, |
|
"reward_std": 0.19467511773109436, |
|
"rewards/improved_len_reward_dast": 0.3890268914401531, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1843.2831420898438, |
|
"epoch": 0.8088139987038238, |
|
"grad_norm": 2.515340566635132, |
|
"kl": 0.6044921875, |
|
"learning_rate": 1.9528338873786882e-07, |
|
"loss": 0.0131, |
|
"reward": 0.2797587066888809, |
|
"reward_std": 0.2669145464897156, |
|
"rewards/improved_len_reward_dast": 0.2797587066888809, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1955.7856750488281, |
|
"epoch": 0.8114063512637719, |
|
"grad_norm": 1.1451594829559326, |
|
"kl": 0.521728515625, |
|
"learning_rate": 1.9278382158120116e-07, |
|
"loss": 0.0705, |
|
"reward": 0.3171418644487858, |
|
"reward_std": 0.32902878522872925, |
|
"rewards/improved_len_reward_dast": 0.3171418644487858, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1826.882568359375, |
|
"epoch": 0.81399870382372, |
|
"grad_norm": 1.4347947835922241, |
|
"kl": 0.44482421875, |
|
"learning_rate": 1.9031370377253574e-07, |
|
"loss": 0.1087, |
|
"reward": 0.43766431510448456, |
|
"reward_std": 0.2934253178536892, |
|
"rewards/improved_len_reward_dast": 0.43766431510448456, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 2287.155548095703, |
|
"epoch": 0.8165910563836681, |
|
"grad_norm": 2.921687126159668, |
|
"kl": 0.751953125, |
|
"learning_rate": 1.8787323895147052e-07, |
|
"loss": 0.0098, |
|
"reward": 0.13603791175410151, |
|
"reward_std": 0.3099226616322994, |
|
"rewards/improved_len_reward_dast": 0.13603791175410151, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1757.688720703125, |
|
"epoch": 0.8191834089436163, |
|
"grad_norm": 1.0537687540054321, |
|
"kl": 0.3984375, |
|
"learning_rate": 1.8546262831297438e-07, |
|
"loss": 0.0769, |
|
"reward": 0.3936043158173561, |
|
"reward_std": 0.23435594514012337, |
|
"rewards/improved_len_reward_dast": 0.3936043158173561, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 2103.642791748047, |
|
"epoch": 0.8217757615035645, |
|
"grad_norm": 2.2986154556274414, |
|
"kl": 0.72412109375, |
|
"learning_rate": 1.8308207059079938e-07, |
|
"loss": 0.0538, |
|
"reward": 0.2378080729395151, |
|
"reward_std": 0.301589660346508, |
|
"rewards/improved_len_reward_dast": 0.2378080729395151, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 2209.165740966797, |
|
"epoch": 0.8243681140635126, |
|
"grad_norm": 1.0545653104782104, |
|
"kl": 0.5751953125, |
|
"learning_rate": 1.8073176204109837e-07, |
|
"loss": 0.0822, |
|
"reward": 0.36628346890211105, |
|
"reward_std": 0.3119317665696144, |
|
"rewards/improved_len_reward_dast": 0.36628346890211105, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 2171.931121826172, |
|
"epoch": 0.8269604666234608, |
|
"grad_norm": 1.807541847229004, |
|
"kl": 0.59375, |
|
"learning_rate": 1.7841189642624428e-07, |
|
"loss": 0.0284, |
|
"reward": 0.2682526409626007, |
|
"reward_std": 0.29699693247675896, |
|
"rewards/improved_len_reward_dast": 0.2682526409626007, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 2228.471893310547, |
|
"epoch": 0.829552819183409, |
|
"grad_norm": 1.075270652770996, |
|
"kl": 0.5126953125, |
|
"learning_rate": 1.7612266499885642e-07, |
|
"loss": 0.0831, |
|
"reward": 0.36243029683828354, |
|
"reward_std": 0.28352705761790276, |
|
"rewards/improved_len_reward_dast": 0.36243029683828354, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1616.5943603515625, |
|
"epoch": 0.8321451717433571, |
|
"grad_norm": 1.301566243171692, |
|
"kl": 0.39794921875, |
|
"learning_rate": 1.7386425648603354e-07, |
|
"loss": 0.0878, |
|
"reward": 0.4413522332906723, |
|
"reward_std": 0.24165164679288864, |
|
"rewards/improved_len_reward_dast": 0.4413522332906723, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1898.8468933105469, |
|
"epoch": 0.8347375243033053, |
|
"grad_norm": 1.221224308013916, |
|
"kl": 0.501953125, |
|
"learning_rate": 1.716368570737946e-07, |
|
"loss": 0.0483, |
|
"reward": 0.2525193989276886, |
|
"reward_std": 0.3362556993961334, |
|
"rewards/improved_len_reward_dast": 0.2525193989276886, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 2523.7422790527344, |
|
"epoch": 0.8373298768632534, |
|
"grad_norm": 1.103893756866455, |
|
"kl": 0.5478515625, |
|
"learning_rate": 1.6944065039173004e-07, |
|
"loss": 0.0645, |
|
"reward": 0.11981333699077368, |
|
"reward_std": 0.3073917515575886, |
|
"rewards/improved_len_reward_dast": 0.11981333699077368, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 2292.2525024414062, |
|
"epoch": 0.8399222294232016, |
|
"grad_norm": 1.1243617534637451, |
|
"kl": 0.4267578125, |
|
"learning_rate": 1.672758174978622e-07, |
|
"loss": 0.0546, |
|
"reward": 0.2594154104590416, |
|
"reward_std": 0.2828039526939392, |
|
"rewards/improved_len_reward_dast": 0.2594154104590416, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 1818.2958984375, |
|
"epoch": 0.8425145819831497, |
|
"grad_norm": 1.7014206647872925, |
|
"kl": 0.4521484375, |
|
"learning_rate": 1.6514253686371917e-07, |
|
"loss": 0.1072, |
|
"reward": 0.3174915425479412, |
|
"reward_std": 0.27957041934132576, |
|
"rewards/improved_len_reward_dast": 0.3174915425479412, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1835.6147766113281, |
|
"epoch": 0.8451069345430978, |
|
"grad_norm": 0.6114805936813354, |
|
"kl": 0.344482421875, |
|
"learning_rate": 1.630409843596216e-07, |
|
"loss": 0.0602, |
|
"reward": 0.38836774975061417, |
|
"reward_std": 0.27062665671110153, |
|
"rewards/improved_len_reward_dast": 0.38836774975061417, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1859.8290100097656, |
|
"epoch": 0.847699287103046, |
|
"grad_norm": 1.033449649810791, |
|
"kl": 0.3984375, |
|
"learning_rate": 1.609713332401831e-07, |
|
"loss": 0.0335, |
|
"reward": 0.35275041311979294, |
|
"reward_std": 0.2643149308860302, |
|
"rewards/improved_len_reward_dast": 0.35275041311979294, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1828.1096496582031, |
|
"epoch": 0.8502916396629941, |
|
"grad_norm": 0.9170458912849426, |
|
"kl": 0.41552734375, |
|
"learning_rate": 1.5893375413002765e-07, |
|
"loss": 0.0429, |
|
"reward": 0.2639412134885788, |
|
"reward_std": 0.26479368656873703, |
|
"rewards/improved_len_reward_dast": 0.2639412134885788, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 2143.4591369628906, |
|
"epoch": 0.8528839922229423, |
|
"grad_norm": 0.4960505962371826, |
|
"kl": 0.32421875, |
|
"learning_rate": 1.569284150097226e-07, |
|
"loss": 0.0477, |
|
"reward": 0.34283383935689926, |
|
"reward_std": 0.24718820676207542, |
|
"rewards/improved_len_reward_dast": 0.34283383935689926, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 1974.7805786132812, |
|
"epoch": 0.8554763447828905, |
|
"grad_norm": 0.8014364838600159, |
|
"kl": 0.52587890625, |
|
"learning_rate": 1.5495548120193003e-07, |
|
"loss": 0.0955, |
|
"reward": 0.2745523639023304, |
|
"reward_std": 0.3219694271683693, |
|
"rewards/improved_len_reward_dast": 0.2745523639023304, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1617.0892333984375, |
|
"epoch": 0.8580686973428386, |
|
"grad_norm": 1.7895324230194092, |
|
"kl": 0.3525390625, |
|
"learning_rate": 1.5301511535777784e-07, |
|
"loss": 0.1257, |
|
"reward": 0.43920181691646576, |
|
"reward_std": 0.26625148952007294, |
|
"rewards/improved_len_reward_dast": 0.43920181691646576, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1910.3468627929688, |
|
"epoch": 0.8606610499027868, |
|
"grad_norm": 1.3269976377487183, |
|
"kl": 0.4423828125, |
|
"learning_rate": 1.5110747744345006e-07, |
|
"loss": 0.0978, |
|
"reward": 0.25987571477890015, |
|
"reward_std": 0.32213833928108215, |
|
"rewards/improved_len_reward_dast": 0.25987571477890015, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1990.3570861816406, |
|
"epoch": 0.863253402462735, |
|
"grad_norm": 1.6192659139633179, |
|
"kl": 0.510986328125, |
|
"learning_rate": 1.4923272472699986e-07, |
|
"loss": 0.0687, |
|
"reward": 0.19616913609206676, |
|
"reward_std": 0.2878040000796318, |
|
"rewards/improved_len_reward_dast": 0.19616913609206676, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1567.4055786132812, |
|
"epoch": 0.8658457550226831, |
|
"grad_norm": 1.5365605354309082, |
|
"kl": 0.43603515625, |
|
"learning_rate": 1.4739101176538274e-07, |
|
"loss": 0.1329, |
|
"reward": 0.21937411278486252, |
|
"reward_std": 0.3014941178262234, |
|
"rewards/improved_len_reward_dast": 0.21937411278486252, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1955.3647766113281, |
|
"epoch": 0.8684381075826313, |
|
"grad_norm": 1.001995325088501, |
|
"kl": 0.40283203125, |
|
"learning_rate": 1.4558249039171639e-07, |
|
"loss": 0.0949, |
|
"reward": 0.25975842773914337, |
|
"reward_std": 0.3084189146757126, |
|
"rewards/improved_len_reward_dast": 0.25975842773914337, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 2266.614776611328, |
|
"epoch": 0.8710304601425793, |
|
"grad_norm": 1.8815863132476807, |
|
"kl": 0.4033203125, |
|
"learning_rate": 1.4380730970276195e-07, |
|
"loss": 0.1972, |
|
"reward": 0.2519003488123417, |
|
"reward_std": 0.25883801840245724, |
|
"rewards/improved_len_reward_dast": 0.2519003488123417, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 2940.5254516601562, |
|
"epoch": 0.8736228127025275, |
|
"grad_norm": 1.5017142295837402, |
|
"kl": 0.7333984375, |
|
"learning_rate": 1.420656160466333e-07, |
|
"loss": 0.0465, |
|
"reward": 0.07994039542973042, |
|
"reward_std": 0.2474011294543743, |
|
"rewards/improved_len_reward_dast": 0.07994039542973042, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 2536.4744567871094, |
|
"epoch": 0.8762151652624757, |
|
"grad_norm": 3.5930166244506836, |
|
"kl": 0.65185546875, |
|
"learning_rate": 1.4035755301073102e-07, |
|
"loss": 0.2242, |
|
"reward": 0.2070501446723938, |
|
"reward_std": 0.2260904610157013, |
|
"rewards/improved_len_reward_dast": 0.2070501446723938, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 2507.9974365234375, |
|
"epoch": 0.8788075178224238, |
|
"grad_norm": 2.0312681198120117, |
|
"kl": 0.79931640625, |
|
"learning_rate": 1.386832614099056e-07, |
|
"loss": 0.2886, |
|
"reward": 0.1947159543633461, |
|
"reward_std": 0.25280311703681946, |
|
"rewards/improved_len_reward_dast": 0.1947159543633461, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 2240.9642333984375, |
|
"epoch": 0.881399870382372, |
|
"grad_norm": 6.861503601074219, |
|
"kl": 0.6796875, |
|
"learning_rate": 1.3704287927484846e-07, |
|
"loss": 0.4098, |
|
"reward": 0.09058164700400084, |
|
"reward_std": 0.22857186198234558, |
|
"rewards/improved_len_reward_dast": 0.09058164700400084, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 2856.7295532226562, |
|
"epoch": 0.8839922229423202, |
|
"grad_norm": 1.7314866781234741, |
|
"kl": 0.9296875, |
|
"learning_rate": 1.3543654184071186e-07, |
|
"loss": 0.2576, |
|
"reward": 0.04280344722792506, |
|
"reward_std": 0.23391348123550415, |
|
"rewards/improved_len_reward_dast": 0.04280344722792506, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 3017.0203857421875, |
|
"epoch": 0.8865845755022683, |
|
"grad_norm": 2.1199285984039307, |
|
"kl": 1.123046875, |
|
"learning_rate": 1.3386438153596067e-07, |
|
"loss": 0.259, |
|
"reward": 0.03899642452597618, |
|
"reward_std": 0.1816622130572796, |
|
"rewards/improved_len_reward_dast": 0.03899642452597618, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 2673.2601318359375, |
|
"epoch": 0.8891769280622165, |
|
"grad_norm": 4.129951000213623, |
|
"kl": 1.0849609375, |
|
"learning_rate": 1.323265279714543e-07, |
|
"loss": 0.4278, |
|
"reward": 0.07075950875878334, |
|
"reward_std": 0.17871661111712456, |
|
"rewards/improved_len_reward_dast": 0.07075950875878334, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 2820.4183349609375, |
|
"epoch": 0.8917692806221647, |
|
"grad_norm": 3.029540777206421, |
|
"kl": 1.125, |
|
"learning_rate": 1.3082310792976202e-07, |
|
"loss": 0.2883, |
|
"reward": 0.1679403679445386, |
|
"reward_std": 0.1663094200193882, |
|
"rewards/improved_len_reward_dast": 0.1679403679445386, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 2832.7269897460938, |
|
"epoch": 0.8943616331821128, |
|
"grad_norm": 1.9915223121643066, |
|
"kl": 1.0732421875, |
|
"learning_rate": 1.293542453547102e-07, |
|
"loss": 0.2825, |
|
"reward": 0.09134133439511061, |
|
"reward_std": 0.25903644412755966, |
|
"rewards/improved_len_reward_dast": 0.09134133439511061, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 2574.1351318359375, |
|
"epoch": 0.8969539857420609, |
|
"grad_norm": 2.0541656017303467, |
|
"kl": 1.0693359375, |
|
"learning_rate": 1.279200613411642e-07, |
|
"loss": 0.3294, |
|
"reward": 0.09616942587308586, |
|
"reward_std": 0.2244393788278103, |
|
"rewards/improved_len_reward_dast": 0.09616942587308586, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 3264.4540405273438, |
|
"epoch": 0.899546338302009, |
|
"grad_norm": 2.7875149250030518, |
|
"kl": 1.189453125, |
|
"learning_rate": 1.2652067412504605e-07, |
|
"loss": 0.1564, |
|
"reward": 0.06414215068798512, |
|
"reward_std": 0.20420588552951813, |
|
"rewards/improved_len_reward_dast": 0.06414215068798512, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 2914.2907104492188, |
|
"epoch": 0.9021386908619572, |
|
"grad_norm": 6.438938617706299, |
|
"kl": 1.1162109375, |
|
"learning_rate": 1.251561990735859e-07, |
|
"loss": 0.3186, |
|
"reward": 0.14111983217298985, |
|
"reward_std": 0.1892341412603855, |
|
"rewards/improved_len_reward_dast": 0.14111983217298985, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 3092.1886596679688, |
|
"epoch": 0.9047310434219054, |
|
"grad_norm": 4.280767917633057, |
|
"kl": 0.80224609375, |
|
"learning_rate": 1.238267486758117e-07, |
|
"loss": 0.1811, |
|
"reward": 0.0037063490599393845, |
|
"reward_std": 0.19519924372434616, |
|
"rewards/improved_len_reward_dast": 0.0037063490599393845, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 3299.4285888671875, |
|
"epoch": 0.9073233959818535, |
|
"grad_norm": 1.7542351484298706, |
|
"kl": 0.72265625, |
|
"learning_rate": 1.2253243253327504e-07, |
|
"loss": 0.2208, |
|
"reward": 0.11870704032480717, |
|
"reward_std": 0.20294193923473358, |
|
"rewards/improved_len_reward_dast": 0.11870704032480717, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 2654.012725830078, |
|
"epoch": 0.9099157485418017, |
|
"grad_norm": 8.6357421875, |
|
"kl": 0.44775390625, |
|
"learning_rate": 1.212733573510154e-07, |
|
"loss": 0.2941, |
|
"reward": 0.256839819252491, |
|
"reward_std": 0.22414838150143623, |
|
"rewards/improved_len_reward_dast": 0.256839819252491, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 2815.5816040039062, |
|
"epoch": 0.9125081011017498, |
|
"grad_norm": 4.935699462890625, |
|
"kl": 0.4931640625, |
|
"learning_rate": 1.20049626928764e-07, |
|
"loss": 0.3194, |
|
"reward": 0.1734664011746645, |
|
"reward_std": 0.22984974458813667, |
|
"rewards/improved_len_reward_dast": 0.1734664011746645, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 3032.7295532226562, |
|
"epoch": 0.915100453661698, |
|
"grad_norm": 7.583729267120361, |
|
"kl": 0.67822265625, |
|
"learning_rate": 1.1886134215238539e-07, |
|
"loss": 0.2992, |
|
"reward": 0.13638373278081417, |
|
"reward_std": 0.20758359506726265, |
|
"rewards/improved_len_reward_dast": 0.13638373278081417, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 2698.2423095703125, |
|
"epoch": 0.9176928062216462, |
|
"grad_norm": 9.70632553100586, |
|
"kl": 0.9833984375, |
|
"learning_rate": 1.1770860098556122e-07, |
|
"loss": 0.3735, |
|
"reward": 0.1530514433979988, |
|
"reward_std": 0.20933512970805168, |
|
"rewards/improved_len_reward_dast": 0.1530514433979988, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 2930.4693908691406, |
|
"epoch": 0.9202851587815943, |
|
"grad_norm": 1.9580261707305908, |
|
"kl": 1.13671875, |
|
"learning_rate": 1.1659149846171314e-07, |
|
"loss": 0.2547, |
|
"reward": 0.1397750903852284, |
|
"reward_std": 0.16655682772397995, |
|
"rewards/improved_len_reward_dast": 0.1397750903852284, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 2562.2881774902344, |
|
"epoch": 0.9228775113415425, |
|
"grad_norm": 4.772850036621094, |
|
"kl": 0.8603515625, |
|
"learning_rate": 1.1551012667616889e-07, |
|
"loss": 0.5187, |
|
"reward": 0.17518793791532516, |
|
"reward_std": 0.18449129536747932, |
|
"rewards/improved_len_reward_dast": 0.17518793791532516, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 2741.387725830078, |
|
"epoch": 0.9254698639014906, |
|
"grad_norm": 1.7493088245391846, |
|
"kl": 0.87744140625, |
|
"learning_rate": 1.1446457477856933e-07, |
|
"loss": 0.3361, |
|
"reward": 0.11712268507108092, |
|
"reward_std": 0.22804437577724457, |
|
"rewards/improved_len_reward_dast": 0.11712268507108092, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 2930.1070556640625, |
|
"epoch": 0.9280622164614387, |
|
"grad_norm": 2.6250553131103516, |
|
"kl": 1.537109375, |
|
"learning_rate": 1.1345492896551908e-07, |
|
"loss": 0.2382, |
|
"reward": 0.12876404216513038, |
|
"reward_std": 0.23319095373153687, |
|
"rewards/improved_len_reward_dast": 0.12876404216513038, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 2673.6530151367188, |
|
"epoch": 0.9306545690213869, |
|
"grad_norm": 5.296914100646973, |
|
"kl": 1.3232421875, |
|
"learning_rate": 1.1248127247348025e-07, |
|
"loss": 0.342, |
|
"reward": 0.23917717207223177, |
|
"reward_std": 0.22187871485948563, |
|
"rewards/improved_len_reward_dast": 0.23917717207223177, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 2845.5509643554688, |
|
"epoch": 0.933246921581335, |
|
"grad_norm": 1.8034971952438354, |
|
"kl": 1.294921875, |
|
"learning_rate": 1.1154368557191032e-07, |
|
"loss": 0.3034, |
|
"reward": 0.11301134852692485, |
|
"reward_std": 0.2030489146709442, |
|
"rewards/improved_len_reward_dast": 0.11301134852692485, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 2302.813751220703, |
|
"epoch": 0.9358392741412832, |
|
"grad_norm": 1.5550919771194458, |
|
"kl": 0.9931640625, |
|
"learning_rate": 1.1064224555664489e-07, |
|
"loss": 0.3347, |
|
"reward": 0.16131599247455597, |
|
"reward_std": 0.2146884724497795, |
|
"rewards/improved_len_reward_dast": 0.16131599247455597, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 3010.4871215820312, |
|
"epoch": 0.9384316267012314, |
|
"grad_norm": 1.0812184810638428, |
|
"kl": 1.267578125, |
|
"learning_rate": 1.0977702674352485e-07, |
|
"loss": 0.3206, |
|
"reward": 0.08643259108066559, |
|
"reward_std": 0.19253767281770706, |
|
"rewards/improved_len_reward_dast": 0.08643259108066559, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 2673.1708374023438, |
|
"epoch": 0.9410239792611795, |
|
"grad_norm": 1.4644445180892944, |
|
"kl": 1.01123046875, |
|
"learning_rate": 1.0894810046227007e-07, |
|
"loss": 0.3343, |
|
"reward": 0.18486913572996855, |
|
"reward_std": 0.2324334941804409, |
|
"rewards/improved_len_reward_dast": 0.18486913572996855, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 2397.7474365234375, |
|
"epoch": 0.9436163318211277, |
|
"grad_norm": 3.365234851837158, |
|
"kl": 0.8212890625, |
|
"learning_rate": 1.0815553505059864e-07, |
|
"loss": 0.3498, |
|
"reward": 0.2783215790987015, |
|
"reward_std": 0.20841009542346, |
|
"rewards/improved_len_reward_dast": 0.2783215790987015, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 2809.4030151367188, |
|
"epoch": 0.9462086843810759, |
|
"grad_norm": 4.3384833335876465, |
|
"kl": 0.8330078125, |
|
"learning_rate": 1.0739939584859327e-07, |
|
"loss": 0.2444, |
|
"reward": 0.24181043915450573, |
|
"reward_std": 0.2128814272582531, |
|
"rewards/improved_len_reward_dast": 0.24181043915450573, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 2888.7474365234375, |
|
"epoch": 0.948801036941024, |
|
"grad_norm": 5.5493574142456055, |
|
"kl": 0.7646484375, |
|
"learning_rate": 1.066797451933144e-07, |
|
"loss": 0.2898, |
|
"reward": 0.20839058235287666, |
|
"reward_std": 0.22839120030403137, |
|
"rewards/improved_len_reward_dast": 0.20839058235287666, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 2496.9310302734375, |
|
"epoch": 0.9513933895009722, |
|
"grad_norm": 4.914091110229492, |
|
"kl": 0.8203125, |
|
"learning_rate": 1.0599664241366108e-07, |
|
"loss": 0.2941, |
|
"reward": 0.2661769837141037, |
|
"reward_std": 0.26103585585951805, |
|
"rewards/improved_len_reward_dast": 0.2661769837141037, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 2880.0280151367188, |
|
"epoch": 0.9539857420609202, |
|
"grad_norm": 3.833822011947632, |
|
"kl": 0.83251953125, |
|
"learning_rate": 1.0535014382547976e-07, |
|
"loss": 0.2725, |
|
"reward": 0.2633733693510294, |
|
"reward_std": 0.2882365696132183, |
|
"rewards/improved_len_reward_dast": 0.2633733693510294, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 2686.16064453125, |
|
"epoch": 0.9565780946208684, |
|
"grad_norm": 3.0906360149383545, |
|
"kl": 1.05078125, |
|
"learning_rate": 1.0474030272692176e-07, |
|
"loss": 0.3183, |
|
"reward": 0.21776283904910088, |
|
"reward_std": 0.26876696199178696, |
|
"rewards/improved_len_reward_dast": 0.21776283904910088, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 2501.721923828125, |
|
"epoch": 0.9591704471808166, |
|
"grad_norm": 1.4724576473236084, |
|
"kl": 0.9521484375, |
|
"learning_rate": 1.0416716939404906e-07, |
|
"loss": 0.2768, |
|
"reward": 0.24670540168881416, |
|
"reward_std": 0.24352310225367546, |
|
"rewards/improved_len_reward_dast": 0.24670540168881416, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 2630.0331420898438, |
|
"epoch": 0.9617627997407647, |
|
"grad_norm": 1.545382022857666, |
|
"kl": 1.044921875, |
|
"learning_rate": 1.0363079107668965e-07, |
|
"loss": 0.2864, |
|
"reward": 0.22240487672388554, |
|
"reward_std": 0.22445869073271751, |
|
"rewards/improved_len_reward_dast": 0.22240487672388554, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 2604.4744262695312, |
|
"epoch": 0.9643551523007129, |
|
"grad_norm": 1.856889009475708, |
|
"kl": 1.2001953125, |
|
"learning_rate": 1.03131211994542e-07, |
|
"loss": 0.2622, |
|
"reward": 0.16849582828581333, |
|
"reward_std": 0.216482974588871, |
|
"rewards/improved_len_reward_dast": 0.16849582828581333, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 2533.7601928710938, |
|
"epoch": 0.9669475048606611, |
|
"grad_norm": 1.1367889642715454, |
|
"kl": 1.12841796875, |
|
"learning_rate": 1.0266847333352986e-07, |
|
"loss": 0.3456, |
|
"reward": 0.2437375970184803, |
|
"reward_std": 0.21505925431847572, |
|
"rewards/improved_len_reward_dast": 0.2437375970184803, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 2314.323944091797, |
|
"epoch": 0.9695398574206092, |
|
"grad_norm": 1.5537844896316528, |
|
"kl": 1.2490234375, |
|
"learning_rate": 1.022426132424064e-07, |
|
"loss": 0.3655, |
|
"reward": 0.19233586266636848, |
|
"reward_std": 0.21690138429403305, |
|
"rewards/improved_len_reward_dast": 0.19233586266636848, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 2416.1275329589844, |
|
"epoch": 0.9721322099805574, |
|
"grad_norm": 1.4035214185714722, |
|
"kl": 1.17578125, |
|
"learning_rate": 1.0185366682960968e-07, |
|
"loss": 0.3357, |
|
"reward": 0.20540117495693266, |
|
"reward_std": 0.23184461519122124, |
|
"rewards/improved_len_reward_dast": 0.20540117495693266, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 2568.6044921875, |
|
"epoch": 0.9747245625405055, |
|
"grad_norm": 2.378544330596924, |
|
"kl": 1.412109375, |
|
"learning_rate": 1.015016661603677e-07, |
|
"loss": 0.3565, |
|
"reward": 0.17796143516898155, |
|
"reward_std": 0.18820034340023994, |
|
"rewards/improved_len_reward_dast": 0.17796143516898155, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 2732.6095581054688, |
|
"epoch": 0.9773169151004537, |
|
"grad_norm": 1.6996707916259766, |
|
"kl": 1.330078125, |
|
"learning_rate": 1.011866402540555e-07, |
|
"loss": 0.2989, |
|
"reward": 0.10426153149455786, |
|
"reward_std": 0.20582210645079613, |
|
"rewards/improved_len_reward_dast": 0.10426153149455786, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 2732.6937866210938, |
|
"epoch": 0.9799092676604018, |
|
"grad_norm": 1.3265814781188965, |
|
"kl": 1.0888671875, |
|
"learning_rate": 1.0090861508180229e-07, |
|
"loss": 0.342, |
|
"reward": 0.15975524485111237, |
|
"reward_std": 0.20655079558491707, |
|
"rewards/improved_len_reward_dast": 0.15975524485111237, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 2176.466766357422, |
|
"epoch": 0.9825016202203499, |
|
"grad_norm": 1.45902419090271, |
|
"kl": 1.046875, |
|
"learning_rate": 1.006676135643506e-07, |
|
"loss": 0.4055, |
|
"reward": 0.29564017802476883, |
|
"reward_std": 0.22227967530488968, |
|
"rewards/improved_len_reward_dast": 0.29564017802476883, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 2283.53564453125, |
|
"epoch": 0.9850939727802981, |
|
"grad_norm": 1.839316964149475, |
|
"kl": 0.7939453125, |
|
"learning_rate": 1.004636555701666e-07, |
|
"loss": 0.2705, |
|
"reward": 0.28889062255620956, |
|
"reward_std": 0.24509106576442719, |
|
"rewards/improved_len_reward_dast": 0.28889062255620956, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 2493.9769287109375, |
|
"epoch": 0.9876863253402463, |
|
"grad_norm": 1.2430649995803833, |
|
"kl": 0.533203125, |
|
"learning_rate": 1.0029675791380211e-07, |
|
"loss": 0.2568, |
|
"reward": 0.36698443442583084, |
|
"reward_std": 0.2436152882874012, |
|
"rewards/improved_len_reward_dast": 0.36698443442583084, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 2185.372344970703, |
|
"epoch": 0.9902786779001944, |
|
"grad_norm": 0.8913379907608032, |
|
"kl": 0.5322265625, |
|
"learning_rate": 1.0016693435450846e-07, |
|
"loss": 0.2093, |
|
"reward": 0.3222588375210762, |
|
"reward_std": 0.28892357647418976, |
|
"rewards/improved_len_reward_dast": 0.3222588375210762, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 2258.5382080078125, |
|
"epoch": 0.9928710304601426, |
|
"grad_norm": 0.7607825398445129, |
|
"kl": 0.333251953125, |
|
"learning_rate": 1.00074195595102e-07, |
|
"loss": 0.14, |
|
"reward": 0.42002584785223007, |
|
"reward_std": 0.22813301160931587, |
|
"rewards/improved_len_reward_dast": 0.42002584785223007, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 1829.7269897460938, |
|
"epoch": 0.9954633830200907, |
|
"grad_norm": 1.3492361307144165, |
|
"kl": 0.322021484375, |
|
"learning_rate": 1.0001854928108199e-07, |
|
"loss": 0.2535, |
|
"reward": 0.4132830575108528, |
|
"reward_std": 0.2525113746523857, |
|
"rewards/improved_len_reward_dast": 0.4132830575108528, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 1845.188705444336, |
|
"epoch": 0.9980557355800389, |
|
"grad_norm": 0.7738073468208313, |
|
"kl": 0.290771484375, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0939, |
|
"reward": 0.41781602054834366, |
|
"reward_std": 0.2655208185315132, |
|
"rewards/improved_len_reward_dast": 0.41781602054834366, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9980557355800389, |
|
"step": 385, |
|
"total_flos": 0.0, |
|
"train_loss": 0.051004564216343064, |
|
"train_runtime": 55695.085, |
|
"train_samples_per_second": 0.194, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 385, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 14, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|