|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9985185185185185, |
|
"eval_steps": 500, |
|
"global_step": 337, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1801.2277526855469, |
|
"epoch": 0.002962962962962963, |
|
"grad_norm": 0.15486460470342944, |
|
"kl": 0.0, |
|
"learning_rate": 2.941176470588235e-08, |
|
"loss": 0.0214, |
|
"reward": 1.2742209732532501, |
|
"reward_std": 0.4368269592523575, |
|
"rewards/accuracy_reward": 0.7901786118745804, |
|
"rewards/cosine_scaled_reward": 0.48404236510396004, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2185.982208251953, |
|
"epoch": 0.005925925925925926, |
|
"grad_norm": 0.13978450729681857, |
|
"kl": 0.0, |
|
"learning_rate": 5.88235294117647e-08, |
|
"loss": -0.0103, |
|
"reward": 0.9630088359117508, |
|
"reward_std": 0.3557196706533432, |
|
"rewards/accuracy_reward": 0.620535746216774, |
|
"rewards/cosine_scaled_reward": 0.3424730747938156, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2115.093780517578, |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 0.1838790154380984, |
|
"kl": 0.00013649463653564453, |
|
"learning_rate": 8.823529411764706e-08, |
|
"loss": 0.1074, |
|
"reward": 1.1710642874240875, |
|
"reward_std": 0.40886973589658737, |
|
"rewards/accuracy_reward": 0.705357164144516, |
|
"rewards/cosine_scaled_reward": 0.46570710837841034, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2096.4554138183594, |
|
"epoch": 0.011851851851851851, |
|
"grad_norm": 0.136462910218941, |
|
"kl": 0.0001285076141357422, |
|
"learning_rate": 1.176470588235294e-07, |
|
"loss": -0.0027, |
|
"reward": 1.3344760239124298, |
|
"reward_std": 0.5786719098687172, |
|
"rewards/accuracy_reward": 0.7857143431901932, |
|
"rewards/cosine_scaled_reward": 0.5487616658210754, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 1957.7903137207031, |
|
"epoch": 0.014814814814814815, |
|
"grad_norm": 0.14830871529797945, |
|
"kl": 0.00012803077697753906, |
|
"learning_rate": 1.4705882352941175e-07, |
|
"loss": -0.0675, |
|
"reward": 1.1853420436382294, |
|
"reward_std": 0.3382917121052742, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.47105635702610016, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 1888.9197082519531, |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 0.12404996018310206, |
|
"kl": 0.0001093149185180664, |
|
"learning_rate": 1.764705882352941e-07, |
|
"loss": 0.0027, |
|
"reward": 1.42927685379982, |
|
"reward_std": 0.31518058851361275, |
|
"rewards/accuracy_reward": 0.8258928805589676, |
|
"rewards/cosine_scaled_reward": 0.60338394343853, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1843.6385192871094, |
|
"epoch": 0.02074074074074074, |
|
"grad_norm": 0.13872508211750081, |
|
"kl": 0.00011134147644042969, |
|
"learning_rate": 2.0588235294117645e-07, |
|
"loss": 0.0631, |
|
"reward": 1.3414935171604156, |
|
"reward_std": 0.35266988538205624, |
|
"rewards/accuracy_reward": 0.785714328289032, |
|
"rewards/cosine_scaled_reward": 0.5557792335748672, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 2165.2545166015625, |
|
"epoch": 0.023703703703703703, |
|
"grad_norm": 0.13634517766953264, |
|
"kl": 0.0001277923583984375, |
|
"learning_rate": 2.352941176470588e-07, |
|
"loss": -0.009, |
|
"reward": 1.2295733392238617, |
|
"reward_std": 0.4489167779684067, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.4885018467903137, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 2090.2500915527344, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.13802999500843113, |
|
"kl": 0.0001348257064819336, |
|
"learning_rate": 2.6470588235294114e-07, |
|
"loss": 0.0217, |
|
"reward": 1.1188903898000717, |
|
"reward_std": 0.3858538120985031, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.41353320330381393, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1973.0134887695312, |
|
"epoch": 0.02962962962962963, |
|
"grad_norm": 0.14065638701142744, |
|
"kl": 0.00011610984802246094, |
|
"learning_rate": 2.941176470588235e-07, |
|
"loss": 0.0352, |
|
"reward": 1.126141995191574, |
|
"reward_std": 0.4128708764910698, |
|
"rewards/accuracy_reward": 0.6964286044239998, |
|
"rewards/cosine_scaled_reward": 0.42971338517963886, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2356.442108154297, |
|
"epoch": 0.03259259259259259, |
|
"grad_norm": 0.15235664392785112, |
|
"kl": 0.00015854835510253906, |
|
"learning_rate": 3.2352941176470586e-07, |
|
"loss": 0.0137, |
|
"reward": 1.133531704545021, |
|
"reward_std": 0.36801043152809143, |
|
"rewards/accuracy_reward": 0.7187500447034836, |
|
"rewards/cosine_scaled_reward": 0.41478168219327927, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 1793.4420776367188, |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 0.15030605931327, |
|
"kl": 9.799003601074219e-05, |
|
"learning_rate": 3.529411764705882e-07, |
|
"loss": 0.0135, |
|
"reward": 1.1753974556922913, |
|
"reward_std": 0.2614951431751251, |
|
"rewards/accuracy_reward": 0.7187500149011612, |
|
"rewards/cosine_scaled_reward": 0.4566473960876465, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1617.4732971191406, |
|
"epoch": 0.03851851851851852, |
|
"grad_norm": 0.16439362332784327, |
|
"kl": 9.578466415405273e-05, |
|
"learning_rate": 3.8235294117647053e-07, |
|
"loss": 0.0481, |
|
"reward": 1.3459759652614594, |
|
"reward_std": 0.19704193621873856, |
|
"rewards/accuracy_reward": 0.848214328289032, |
|
"rewards/cosine_scaled_reward": 0.49776165932416916, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 1364.5179443359375, |
|
"epoch": 0.04148148148148148, |
|
"grad_norm": 0.17539080474732854, |
|
"kl": 9.167194366455078e-05, |
|
"learning_rate": 4.117647058823529e-07, |
|
"loss": -0.0146, |
|
"reward": 1.5306860208511353, |
|
"reward_std": 0.2709691859781742, |
|
"rewards/accuracy_reward": 0.9196428954601288, |
|
"rewards/cosine_scaled_reward": 0.6110431402921677, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1916.4286193847656, |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 0.16486671210804701, |
|
"kl": 0.0001081228256225586, |
|
"learning_rate": 4.4117647058823526e-07, |
|
"loss": 0.0016, |
|
"reward": 1.2406930327415466, |
|
"reward_std": 0.5602811053395271, |
|
"rewards/accuracy_reward": 0.7723214626312256, |
|
"rewards/cosine_scaled_reward": 0.46837157011032104, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1566.6429443359375, |
|
"epoch": 0.047407407407407405, |
|
"grad_norm": 0.15688724371158902, |
|
"kl": 9.28640365600586e-05, |
|
"learning_rate": 4.705882352941176e-07, |
|
"loss": 0.0135, |
|
"reward": 1.4592451453208923, |
|
"reward_std": 0.39220860973000526, |
|
"rewards/accuracy_reward": 0.8526786118745804, |
|
"rewards/cosine_scaled_reward": 0.606566533446312, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 1577.1116333007812, |
|
"epoch": 0.05037037037037037, |
|
"grad_norm": 0.18124476360697023, |
|
"kl": 0.00010126829147338867, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0808, |
|
"reward": 1.2929720282554626, |
|
"reward_std": 0.33900806307792664, |
|
"rewards/accuracy_reward": 0.7901786118745804, |
|
"rewards/cosine_scaled_reward": 0.5027934014797211, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1941.6697387695312, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.1480224883739112, |
|
"kl": 0.00013065338134765625, |
|
"learning_rate": 5.294117647058823e-07, |
|
"loss": 0.0215, |
|
"reward": 1.2064578533172607, |
|
"reward_std": 0.5407338961958885, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.45645780116319656, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 2221.3616943359375, |
|
"epoch": 0.056296296296296296, |
|
"grad_norm": 0.18446288084506915, |
|
"kl": 0.00013172626495361328, |
|
"learning_rate": 5.588235294117647e-07, |
|
"loss": 0.1226, |
|
"reward": 1.3185656666755676, |
|
"reward_std": 0.5382702872157097, |
|
"rewards/accuracy_reward": 0.7857143133878708, |
|
"rewards/cosine_scaled_reward": 0.5328513234853745, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1600.8304748535156, |
|
"epoch": 0.05925925925925926, |
|
"grad_norm": 0.15136971669879618, |
|
"kl": 8.463859558105469e-05, |
|
"learning_rate": 5.88235294117647e-07, |
|
"loss": 0.0009, |
|
"reward": 1.3560706675052643, |
|
"reward_std": 0.27356908470392227, |
|
"rewards/accuracy_reward": 0.7946428954601288, |
|
"rewards/cosine_scaled_reward": 0.5614277720451355, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 1814.1295471191406, |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 0.14862372479915026, |
|
"kl": 0.00012290477752685547, |
|
"learning_rate": 6.176470588235294e-07, |
|
"loss": 0.0478, |
|
"reward": 1.3167778551578522, |
|
"reward_std": 0.351048968732357, |
|
"rewards/accuracy_reward": 0.7901786118745804, |
|
"rewards/cosine_scaled_reward": 0.5265992805361748, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 2074.214385986328, |
|
"epoch": 0.06518518518518518, |
|
"grad_norm": 0.16607453666951524, |
|
"kl": 0.00013315677642822266, |
|
"learning_rate": 6.470588235294117e-07, |
|
"loss": 0.0513, |
|
"reward": 1.0832807272672653, |
|
"reward_std": 0.39553803764283657, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.40470923483371735, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1610.8572387695312, |
|
"epoch": 0.06814814814814815, |
|
"grad_norm": 0.16726951041321547, |
|
"kl": 9.208917617797852e-05, |
|
"learning_rate": 6.764705882352941e-07, |
|
"loss": 0.0374, |
|
"reward": 1.421825647354126, |
|
"reward_std": 0.3287600055336952, |
|
"rewards/accuracy_reward": 0.8348214626312256, |
|
"rewards/cosine_scaled_reward": 0.5870041102170944, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1918.0357971191406, |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 0.17652478738451452, |
|
"kl": 0.00012230873107910156, |
|
"learning_rate": 7.058823529411765e-07, |
|
"loss": 0.0782, |
|
"reward": 1.2598042488098145, |
|
"reward_std": 0.3701645387336612, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.5187327712774277, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 1735.2188720703125, |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.18159857154961948, |
|
"kl": 0.00011110305786132812, |
|
"learning_rate": 7.352941176470589e-07, |
|
"loss": 0.1127, |
|
"reward": 1.3935684263706207, |
|
"reward_std": 0.3388653099536896, |
|
"rewards/accuracy_reward": 0.8303571939468384, |
|
"rewards/cosine_scaled_reward": 0.5632113069295883, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1876.6116943359375, |
|
"epoch": 0.07703703703703704, |
|
"grad_norm": 0.14418288739115104, |
|
"kl": 0.00010466575622558594, |
|
"learning_rate": 7.647058823529411e-07, |
|
"loss": 0.0367, |
|
"reward": 1.2820298671722412, |
|
"reward_std": 0.3634565807878971, |
|
"rewards/accuracy_reward": 0.7812500298023224, |
|
"rewards/cosine_scaled_reward": 0.5007798671722412, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 2139.2410888671875, |
|
"epoch": 0.08, |
|
"grad_norm": 0.1588485720271834, |
|
"kl": 0.0001456737518310547, |
|
"learning_rate": 7.941176470588235e-07, |
|
"loss": 0.0693, |
|
"reward": 1.257809430360794, |
|
"reward_std": 0.4351053759455681, |
|
"rewards/accuracy_reward": 0.7723214626312256, |
|
"rewards/cosine_scaled_reward": 0.4854879528284073, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1813.5581665039062, |
|
"epoch": 0.08296296296296296, |
|
"grad_norm": 0.18155204715794676, |
|
"kl": 0.00012171268463134766, |
|
"learning_rate": 8.235294117647058e-07, |
|
"loss": -0.0196, |
|
"reward": 1.2171168476343155, |
|
"reward_std": 0.47040870040655136, |
|
"rewards/accuracy_reward": 0.7321428954601288, |
|
"rewards/cosine_scaled_reward": 0.4849740155041218, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 1759.1295776367188, |
|
"epoch": 0.08592592592592592, |
|
"grad_norm": 0.1660768801376248, |
|
"kl": 0.00011658668518066406, |
|
"learning_rate": 8.529411764705882e-07, |
|
"loss": 0.0748, |
|
"reward": 1.3603521585464478, |
|
"reward_std": 0.4311341643333435, |
|
"rewards/accuracy_reward": 0.816964328289032, |
|
"rewards/cosine_scaled_reward": 0.5433878675103188, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 2200.4509887695312, |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 0.14464003989927063, |
|
"kl": 0.00014543533325195312, |
|
"learning_rate": 8.823529411764705e-07, |
|
"loss": -0.0008, |
|
"reward": 1.1178553104400635, |
|
"reward_std": 0.41852162033319473, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.4303552433848381, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2137.602813720703, |
|
"epoch": 0.09185185185185185, |
|
"grad_norm": 0.14000193999914415, |
|
"kl": 0.00016105175018310547, |
|
"learning_rate": 9.117647058823529e-07, |
|
"loss": 0.0451, |
|
"reward": 1.04503732919693, |
|
"reward_std": 0.3138642441481352, |
|
"rewards/accuracy_reward": 0.6607142984867096, |
|
"rewards/cosine_scaled_reward": 0.38432304188609123, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 2145.90185546875, |
|
"epoch": 0.09481481481481481, |
|
"grad_norm": 0.14844474863429077, |
|
"kl": 0.0001742839813232422, |
|
"learning_rate": 9.411764705882352e-07, |
|
"loss": 0.0578, |
|
"reward": 1.2176358550786972, |
|
"reward_std": 0.44562142342329025, |
|
"rewards/accuracy_reward": 0.7321428954601288, |
|
"rewards/cosine_scaled_reward": 0.4854929521679878, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1576.7188568115234, |
|
"epoch": 0.09777777777777778, |
|
"grad_norm": 0.18427378846970008, |
|
"kl": 0.000148773193359375, |
|
"learning_rate": 9.705882352941176e-07, |
|
"loss": 0.0295, |
|
"reward": 1.5268434286117554, |
|
"reward_std": 0.3122905343770981, |
|
"rewards/accuracy_reward": 0.8794643431901932, |
|
"rewards/cosine_scaled_reward": 0.6473791301250458, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 2476.5045166015625, |
|
"epoch": 0.10074074074074074, |
|
"grad_norm": 0.11568502492496938, |
|
"kl": 0.0001811981201171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0036, |
|
"reward": 0.9712123870849609, |
|
"reward_std": 0.44801822677254677, |
|
"rewards/accuracy_reward": 0.6339286118745804, |
|
"rewards/cosine_scaled_reward": 0.33728381246328354, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2399.5715942382812, |
|
"epoch": 0.1037037037037037, |
|
"grad_norm": 0.13094815811246613, |
|
"kl": 0.00019669532775878906, |
|
"learning_rate": 9.99975812381176e-07, |
|
"loss": 0.047, |
|
"reward": 0.9391455352306366, |
|
"reward_std": 0.4169693812727928, |
|
"rewards/accuracy_reward": 0.611607164144516, |
|
"rewards/cosine_scaled_reward": 0.3275383338332176, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 1674.7232971191406, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.15634669232229934, |
|
"kl": 0.0001952648162841797, |
|
"learning_rate": 9.999032521248854e-07, |
|
"loss": 0.0049, |
|
"reward": 1.107189193367958, |
|
"reward_std": 0.51768758893013, |
|
"rewards/accuracy_reward": 0.7187500298023224, |
|
"rewards/cosine_scaled_reward": 0.38843920081853867, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 1777.2813415527344, |
|
"epoch": 0.10962962962962963, |
|
"grad_norm": 0.13993849205550643, |
|
"kl": 0.00018024444580078125, |
|
"learning_rate": 9.997823270313945e-07, |
|
"loss": -0.0192, |
|
"reward": 1.2558479607105255, |
|
"reward_std": 0.41079528629779816, |
|
"rewards/accuracy_reward": 0.7723214626312256, |
|
"rewards/cosine_scaled_reward": 0.4835265427827835, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1904.2232971191406, |
|
"epoch": 0.11259259259259259, |
|
"grad_norm": 0.18540633639301507, |
|
"kl": 0.000263214111328125, |
|
"learning_rate": 9.996130501002146e-07, |
|
"loss": 0.0513, |
|
"reward": 1.198735386133194, |
|
"reward_std": 0.4133296310901642, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.4978425204753876, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2139.1474609375, |
|
"epoch": 0.11555555555555555, |
|
"grad_norm": 0.17053105587422207, |
|
"kl": 0.0003085136413574219, |
|
"learning_rate": 9.99395439528705e-07, |
|
"loss": 0.061, |
|
"reward": 1.083926498889923, |
|
"reward_std": 0.44088590145111084, |
|
"rewards/accuracy_reward": 0.6562500298023224, |
|
"rewards/cosine_scaled_reward": 0.4276764690876007, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2316.5670471191406, |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 0.14433164353996455, |
|
"kl": 0.0002789497375488281, |
|
"learning_rate": 9.991295187101175e-07, |
|
"loss": 0.0319, |
|
"reward": 0.9402691721916199, |
|
"reward_std": 0.5452702566981316, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/cosine_scaled_reward": 0.3241976648569107, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 1936.9599304199219, |
|
"epoch": 0.12148148148148148, |
|
"grad_norm": 0.15408529457273207, |
|
"kl": 0.0003170967102050781, |
|
"learning_rate": 9.988153162310798e-07, |
|
"loss": -0.038, |
|
"reward": 1.0872374176979065, |
|
"reward_std": 0.3996356353163719, |
|
"rewards/accuracy_reward": 0.6830357313156128, |
|
"rewards/cosine_scaled_reward": 0.4042016677558422, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 1797.6518859863281, |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 0.1499032007227083, |
|
"kl": 0.0004444122314453125, |
|
"learning_rate": 9.98452865868525e-07, |
|
"loss": 0.0005, |
|
"reward": 1.3414774239063263, |
|
"reward_std": 0.30734935216605663, |
|
"rewards/accuracy_reward": 0.7901786118745804, |
|
"rewards/cosine_scaled_reward": 0.5512988418340683, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1430.83935546875, |
|
"epoch": 0.1274074074074074, |
|
"grad_norm": 0.21684453788165528, |
|
"kl": 0.0005488395690917969, |
|
"learning_rate": 9.980422065860585e-07, |
|
"loss": 0.1485, |
|
"reward": 1.433362364768982, |
|
"reward_std": 0.3419237732887268, |
|
"rewards/accuracy_reward": 0.8258928805589676, |
|
"rewards/cosine_scaled_reward": 0.607469454407692, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 1854.7366638183594, |
|
"epoch": 0.13037037037037036, |
|
"grad_norm": 0.13990265672119997, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 9.975833825297694e-07, |
|
"loss": -0.0013, |
|
"reward": 1.2768487930297852, |
|
"reward_std": 0.44341667741537094, |
|
"rewards/accuracy_reward": 0.7723214626312256, |
|
"rewards/cosine_scaled_reward": 0.5045273676514626, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2298.52685546875, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.14958567896728817, |
|
"kl": 0.0005388259887695312, |
|
"learning_rate": 9.970764430234865e-07, |
|
"loss": 0.0167, |
|
"reward": 0.8812481909990311, |
|
"reward_std": 0.4946395307779312, |
|
"rewards/accuracy_reward": 0.5714286044239998, |
|
"rewards/cosine_scaled_reward": 0.30981964617967606, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2024.4152526855469, |
|
"epoch": 0.1362962962962963, |
|
"grad_norm": 0.17201604459050468, |
|
"kl": 0.0005998611450195312, |
|
"learning_rate": 9.965214425634744e-07, |
|
"loss": 0.0583, |
|
"reward": 1.2592856585979462, |
|
"reward_std": 0.40784522891044617, |
|
"rewards/accuracy_reward": 0.7544643133878708, |
|
"rewards/cosine_scaled_reward": 0.5048213675618172, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1278.745620727539, |
|
"epoch": 0.13925925925925925, |
|
"grad_norm": 0.21231358687230578, |
|
"kl": 0.001026153564453125, |
|
"learning_rate": 9.959184408125757e-07, |
|
"loss": 0.0103, |
|
"reward": 1.3836407363414764, |
|
"reward_std": 0.26683217100799084, |
|
"rewards/accuracy_reward": 0.8303571790456772, |
|
"rewards/cosine_scaled_reward": 0.5532835274934769, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 1805.4867248535156, |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 0.15498315626109, |
|
"kl": 0.0008897781372070312, |
|
"learning_rate": 9.952675025937969e-07, |
|
"loss": 0.0114, |
|
"reward": 1.0790625214576721, |
|
"reward_std": 0.34751547686755657, |
|
"rewards/accuracy_reward": 0.6741071790456772, |
|
"rewards/cosine_scaled_reward": 0.40495534986257553, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 1740.5223999023438, |
|
"epoch": 0.1451851851851852, |
|
"grad_norm": 0.168185152396588, |
|
"kl": 0.0010223388671875, |
|
"learning_rate": 9.945686978833404e-07, |
|
"loss": -0.0252, |
|
"reward": 1.312384933233261, |
|
"reward_std": 0.41872425377368927, |
|
"rewards/accuracy_reward": 0.776785746216774, |
|
"rewards/cosine_scaled_reward": 0.5355991423130035, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 2063.5000915527344, |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.15180436763056188, |
|
"kl": 0.0008392333984375, |
|
"learning_rate": 9.938221018030818e-07, |
|
"loss": -0.0333, |
|
"reward": 1.2742985486984253, |
|
"reward_std": 0.44013547897338867, |
|
"rewards/accuracy_reward": 0.7767857611179352, |
|
"rewards/cosine_scaled_reward": 0.4975127503275871, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 2093.4598999023438, |
|
"epoch": 0.1511111111111111, |
|
"grad_norm": 0.15345817763060737, |
|
"kl": 0.0010747909545898438, |
|
"learning_rate": 9.930277946124936e-07, |
|
"loss": 0.0163, |
|
"reward": 1.179248034954071, |
|
"reward_std": 0.3069497048854828, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.48281943798065186, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 1701.2678833007812, |
|
"epoch": 0.15407407407407409, |
|
"grad_norm": 0.49339086071731963, |
|
"kl": 0.0011758804321289062, |
|
"learning_rate": 9.921858617000186e-07, |
|
"loss": 0.0374, |
|
"reward": 1.399949163198471, |
|
"reward_std": 0.41836266964673996, |
|
"rewards/accuracy_reward": 0.8392857611179352, |
|
"rewards/cosine_scaled_reward": 0.5606633871793747, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 1969.9330749511719, |
|
"epoch": 0.15703703703703703, |
|
"grad_norm": 0.15802222520538123, |
|
"kl": 0.0012388229370117188, |
|
"learning_rate": 9.912963935738895e-07, |
|
"loss": 0.0166, |
|
"reward": 1.2939041405916214, |
|
"reward_std": 0.44797487556934357, |
|
"rewards/accuracy_reward": 0.7678571790456772, |
|
"rewards/cosine_scaled_reward": 0.5260469764471054, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 2150.6162109375, |
|
"epoch": 0.16, |
|
"grad_norm": 0.14836385455922624, |
|
"kl": 0.0013933181762695312, |
|
"learning_rate": 9.903594858523993e-07, |
|
"loss": 0.0438, |
|
"reward": 1.2006288468837738, |
|
"reward_std": 0.38477384112775326, |
|
"rewards/accuracy_reward": 0.7187500298023224, |
|
"rewards/cosine_scaled_reward": 0.481878824532032, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 2185.0983276367188, |
|
"epoch": 0.16296296296296298, |
|
"grad_norm": 0.16192437178799024, |
|
"kl": 0.001556396484375, |
|
"learning_rate": 9.893752392536231e-07, |
|
"loss": 0.0437, |
|
"reward": 1.0603131651878357, |
|
"reward_std": 0.4352143183350563, |
|
"rewards/accuracy_reward": 0.6473214626312256, |
|
"rewards/cosine_scaled_reward": 0.4129916988313198, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 2409.43310546875, |
|
"epoch": 0.16592592592592592, |
|
"grad_norm": 0.1424512212234098, |
|
"kl": 0.001216888427734375, |
|
"learning_rate": 9.883437595845901e-07, |
|
"loss": 0.0462, |
|
"reward": 0.998899444937706, |
|
"reward_std": 0.4582284800708294, |
|
"rewards/accuracy_reward": 0.629464328289032, |
|
"rewards/cosine_scaled_reward": 0.3694351278245449, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 1719.02685546875, |
|
"epoch": 0.1688888888888889, |
|
"grad_norm": 0.1725324452769341, |
|
"kl": 0.0020580291748046875, |
|
"learning_rate": 9.872651577299092e-07, |
|
"loss": 0.0393, |
|
"reward": 1.3792076706886292, |
|
"reward_std": 0.27744055911898613, |
|
"rewards/accuracy_reward": 0.7991071939468384, |
|
"rewards/cosine_scaled_reward": 0.5801005065441132, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 2229.2858276367188, |
|
"epoch": 0.17185185185185184, |
|
"grad_norm": 0.12530389854229343, |
|
"kl": 0.0016937255859375, |
|
"learning_rate": 9.861395496398497e-07, |
|
"loss": 0.0243, |
|
"reward": 1.0262050777673721, |
|
"reward_std": 0.44111158698797226, |
|
"rewards/accuracy_reward": 0.6473214626312256, |
|
"rewards/cosine_scaled_reward": 0.378883657976985, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 1460.5536499023438, |
|
"epoch": 0.1748148148148148, |
|
"grad_norm": 0.17309982186787048, |
|
"kl": 0.0025787353515625, |
|
"learning_rate": 9.849670563178756e-07, |
|
"loss": -0.0294, |
|
"reward": 1.4032874703407288, |
|
"reward_std": 0.42625588178634644, |
|
"rewards/accuracy_reward": 0.8125000447034836, |
|
"rewards/cosine_scaled_reward": 0.5907874628901482, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 1460.8750762939453, |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.1850063622528884, |
|
"kl": 0.0031528472900390625, |
|
"learning_rate": 9.83747803807638e-07, |
|
"loss": 0.0411, |
|
"reward": 1.3028863370418549, |
|
"reward_std": 0.3087996020913124, |
|
"rewards/accuracy_reward": 0.7589286267757416, |
|
"rewards/cosine_scaled_reward": 0.5439577773213387, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 1489.8438110351562, |
|
"epoch": 0.18074074074074073, |
|
"grad_norm": 0.17806944014811066, |
|
"kl": 0.00299835205078125, |
|
"learning_rate": 9.82481923179426e-07, |
|
"loss": 0.0554, |
|
"reward": 1.4762336909770966, |
|
"reward_std": 0.3607482761144638, |
|
"rewards/accuracy_reward": 0.8348214626312256, |
|
"rewards/cosine_scaled_reward": 0.6414122134447098, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 1651.0090026855469, |
|
"epoch": 0.1837037037037037, |
|
"grad_norm": 0.16163382863877152, |
|
"kl": 0.0036773681640625, |
|
"learning_rate": 9.811695505160755e-07, |
|
"loss": 0.0263, |
|
"reward": 1.3000611364841461, |
|
"reward_std": 0.4057988375425339, |
|
"rewards/accuracy_reward": 0.7812500447034836, |
|
"rewards/cosine_scaled_reward": 0.5188110917806625, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 1794.1473999023438, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.18644937494598265, |
|
"kl": 0.003452301025390625, |
|
"learning_rate": 9.79810826898341e-07, |
|
"loss": 0.0748, |
|
"reward": 1.2813213169574738, |
|
"reward_std": 0.3088081255555153, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.5402498543262482, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 1571.9688110351562, |
|
"epoch": 0.18962962962962962, |
|
"grad_norm": 0.1853175840562683, |
|
"kl": 0.003986358642578125, |
|
"learning_rate": 9.784058983897284e-07, |
|
"loss": 0.0044, |
|
"reward": 1.292844980955124, |
|
"reward_std": 0.3763498868793249, |
|
"rewards/accuracy_reward": 0.754464328289032, |
|
"rewards/cosine_scaled_reward": 0.5383806526660919, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 1578.4866943359375, |
|
"epoch": 0.1925925925925926, |
|
"grad_norm": 0.15904947917513404, |
|
"kl": 0.0043487548828125, |
|
"learning_rate": 9.769549160207952e-07, |
|
"loss": 0.0214, |
|
"reward": 1.1014029830694199, |
|
"reward_std": 0.43177659809589386, |
|
"rewards/accuracy_reward": 0.6741071492433548, |
|
"rewards/cosine_scaled_reward": 0.42729582637548447, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 1857.227783203125, |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 0.1606816895577705, |
|
"kl": 0.00336456298828125, |
|
"learning_rate": 9.754580357729116e-07, |
|
"loss": 0.0669, |
|
"reward": 1.3189943879842758, |
|
"reward_std": 0.3866711165755987, |
|
"rewards/accuracy_reward": 0.7723214626312256, |
|
"rewards/cosine_scaled_reward": 0.5466729700565338, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 1460.6875457763672, |
|
"epoch": 0.1985185185185185, |
|
"grad_norm": 0.19474404173280954, |
|
"kl": 0.005466461181640625, |
|
"learning_rate": 9.739154185614949e-07, |
|
"loss": 0.0251, |
|
"reward": 1.337535835802555, |
|
"reward_std": 0.2742351219058037, |
|
"rewards/accuracy_reward": 0.7633928880095482, |
|
"rewards/cosine_scaled_reward": 0.5741428937762976, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 1658.80810546875, |
|
"epoch": 0.20148148148148148, |
|
"grad_norm": 0.1747359976489889, |
|
"kl": 0.0050048828125, |
|
"learning_rate": 9.723272302187106e-07, |
|
"loss": -0.0185, |
|
"reward": 1.2206310108304024, |
|
"reward_std": 0.3201807998120785, |
|
"rewards/accuracy_reward": 0.7098214700818062, |
|
"rewards/cosine_scaled_reward": 0.510809512808919, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 1545.9509887695312, |
|
"epoch": 0.20444444444444446, |
|
"grad_norm": 0.16313740882439987, |
|
"kl": 0.00656890869140625, |
|
"learning_rate": 9.706936414756435e-07, |
|
"loss": 0.0095, |
|
"reward": 1.282968521118164, |
|
"reward_std": 0.4132317379117012, |
|
"rewards/accuracy_reward": 0.7187500298023224, |
|
"rewards/cosine_scaled_reward": 0.5642184466123581, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 1925.4911804199219, |
|
"epoch": 0.2074074074074074, |
|
"grad_norm": 0.17244646258164678, |
|
"kl": 0.0052947998046875, |
|
"learning_rate": 9.69014827943947e-07, |
|
"loss": 0.048, |
|
"reward": 0.9535009413957596, |
|
"reward_std": 0.36283179745078087, |
|
"rewards/accuracy_reward": 0.6250000298023224, |
|
"rewards/cosine_scaled_reward": 0.328500933945179, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 1576.6786346435547, |
|
"epoch": 0.21037037037037037, |
|
"grad_norm": 0.17458726146815962, |
|
"kl": 0.006336212158203125, |
|
"learning_rate": 9.672909700969612e-07, |
|
"loss": 0.0494, |
|
"reward": 1.3331668823957443, |
|
"reward_std": 0.4533561021089554, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/cosine_scaled_reward": 0.5563811622560024, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 1622.2277221679688, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.18810519087736544, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 9.65522253250316e-07, |
|
"loss": 0.062, |
|
"reward": 1.2747989892959595, |
|
"reward_std": 0.3764601796865463, |
|
"rewards/accuracy_reward": 0.7321428805589676, |
|
"rewards/cosine_scaled_reward": 0.5426560789346695, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1628.651870727539, |
|
"epoch": 0.2162962962962963, |
|
"grad_norm": 0.1913056968217199, |
|
"kl": 0.0076446533203125, |
|
"learning_rate": 9.637088675420063e-07, |
|
"loss": 0.0442, |
|
"reward": 1.371776431798935, |
|
"reward_std": 0.4677218608558178, |
|
"rewards/accuracy_reward": 0.7946428954601288, |
|
"rewards/cosine_scaled_reward": 0.577133521437645, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 1836.1116943359375, |
|
"epoch": 0.21925925925925926, |
|
"grad_norm": 0.16928420028282418, |
|
"kl": 0.00873565673828125, |
|
"learning_rate": 9.618510079119533e-07, |
|
"loss": 0.0887, |
|
"reward": 1.2351452708244324, |
|
"reward_std": 0.3623766005039215, |
|
"rewards/accuracy_reward": 0.7276786118745804, |
|
"rewards/cosine_scaled_reward": 0.5074666365981102, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 1350.9152526855469, |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.1893299932553231, |
|
"kl": 0.009857177734375, |
|
"learning_rate": 9.59948874081048e-07, |
|
"loss": 0.0722, |
|
"reward": 1.0651443749666214, |
|
"reward_std": 0.486992284655571, |
|
"rewards/accuracy_reward": 0.6651786118745804, |
|
"rewards/cosine_scaled_reward": 0.3999657705426216, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 1341.65185546875, |
|
"epoch": 0.22518518518518518, |
|
"grad_norm": 0.19874429784134148, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 9.580026705296824e-07, |
|
"loss": 0.0374, |
|
"reward": 1.4165225625038147, |
|
"reward_std": 0.3073725774884224, |
|
"rewards/accuracy_reward": 0.7991071790456772, |
|
"rewards/cosine_scaled_reward": 0.6174153983592987, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 1584.9866485595703, |
|
"epoch": 0.22814814814814816, |
|
"grad_norm": 0.192470093500917, |
|
"kl": 0.0110626220703125, |
|
"learning_rate": 9.56012606475766e-07, |
|
"loss": 0.036, |
|
"reward": 1.2434572279453278, |
|
"reward_std": 0.42061255872249603, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/cosine_scaled_reward": 0.5291714444756508, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 1354.6607666015625, |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 0.20023302979275232, |
|
"kl": 0.012298583984375, |
|
"learning_rate": 9.539788958522353e-07, |
|
"loss": 0.0618, |
|
"reward": 1.3703485876321793, |
|
"reward_std": 0.39164508879184723, |
|
"rewards/accuracy_reward": 0.785714328289032, |
|
"rewards/cosine_scaled_reward": 0.5846342295408249, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 1133.839340209961, |
|
"epoch": 0.23407407407407407, |
|
"grad_norm": 0.1842508631519829, |
|
"kl": 0.0138397216796875, |
|
"learning_rate": 9.519017572840562e-07, |
|
"loss": 0.0505, |
|
"reward": 1.3714804649353027, |
|
"reward_std": 0.41867052018642426, |
|
"rewards/accuracy_reward": 0.7857143133878708, |
|
"rewards/cosine_scaled_reward": 0.5857661366462708, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 1497.263412475586, |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 0.1959550919083615, |
|
"kl": 0.0127410888671875, |
|
"learning_rate": 9.49781414064722e-07, |
|
"loss": 0.0741, |
|
"reward": 1.1733836829662323, |
|
"reward_std": 0.5299587771296501, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.4635622203350067, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 1383.0313262939453, |
|
"epoch": 0.24, |
|
"grad_norm": 0.21509846180208117, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 9.476180941322485e-07, |
|
"loss": 0.0916, |
|
"reward": 1.1244508922100067, |
|
"reward_std": 0.31519509851932526, |
|
"rewards/accuracy_reward": 0.6562500223517418, |
|
"rewards/cosine_scaled_reward": 0.4682008996605873, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 1542.4465026855469, |
|
"epoch": 0.24296296296296296, |
|
"grad_norm": 0.17903856801808032, |
|
"kl": 0.0135650634765625, |
|
"learning_rate": 9.454120300446708e-07, |
|
"loss": 0.1177, |
|
"reward": 1.032856598496437, |
|
"reward_std": 0.37768274173140526, |
|
"rewards/accuracy_reward": 0.6160714328289032, |
|
"rewards/cosine_scaled_reward": 0.4167851284146309, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 1229.1741638183594, |
|
"epoch": 0.24592592592592594, |
|
"grad_norm": 0.1895334108803511, |
|
"kl": 0.0172882080078125, |
|
"learning_rate": 9.431634589550437e-07, |
|
"loss": 0.0994, |
|
"reward": 1.2515216022729874, |
|
"reward_std": 0.329837616533041, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5417001619935036, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 1215.3705749511719, |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 0.25854148885682715, |
|
"kl": 0.0201416015625, |
|
"learning_rate": 9.408726225859463e-07, |
|
"loss": -0.029, |
|
"reward": 1.17463618516922, |
|
"reward_std": 0.4334421083331108, |
|
"rewards/accuracy_reward": 0.6651785969734192, |
|
"rewards/cosine_scaled_reward": 0.5094575956463814, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 1066.089340209961, |
|
"epoch": 0.2518518518518518, |
|
"grad_norm": 0.2213349369656864, |
|
"kl": 0.021392822265625, |
|
"learning_rate": 9.385397672034984e-07, |
|
"loss": 0.1045, |
|
"reward": 1.4549466967582703, |
|
"reward_std": 0.3001119792461395, |
|
"rewards/accuracy_reward": 0.8125000447034836, |
|
"rewards/cosine_scaled_reward": 0.6424466073513031, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 1157.5848693847656, |
|
"epoch": 0.2548148148148148, |
|
"grad_norm": 0.22308540448597394, |
|
"kl": 0.024993896484375, |
|
"learning_rate": 9.361651435908859e-07, |
|
"loss": 0.0057, |
|
"reward": 1.383677989244461, |
|
"reward_std": 0.4137191101908684, |
|
"rewards/accuracy_reward": 0.7633928954601288, |
|
"rewards/cosine_scaled_reward": 0.6202851235866547, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 1113.575942993164, |
|
"epoch": 0.2577777777777778, |
|
"grad_norm": 0.2376225453935383, |
|
"kl": 0.028778076171875, |
|
"learning_rate": 9.337490070214005e-07, |
|
"loss": 0.0684, |
|
"reward": 1.496872365474701, |
|
"reward_std": 0.41324392706155777, |
|
"rewards/accuracy_reward": 0.8125000298023224, |
|
"rewards/cosine_scaled_reward": 0.6843723505735397, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 944.7812957763672, |
|
"epoch": 0.2607407407407407, |
|
"grad_norm": 0.20249928269924236, |
|
"kl": 0.02642822265625, |
|
"learning_rate": 9.312916172309998e-07, |
|
"loss": 0.0757, |
|
"reward": 1.4803976714611053, |
|
"reward_std": 0.3520447090268135, |
|
"rewards/accuracy_reward": 0.8125000447034836, |
|
"rewards/cosine_scaled_reward": 0.667897641658783, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1494.6563110351562, |
|
"epoch": 0.2637037037037037, |
|
"grad_norm": 0.19755508814823958, |
|
"kl": 0.02459716796875, |
|
"learning_rate": 9.287932383903842e-07, |
|
"loss": 0.0235, |
|
"reward": 1.2657422125339508, |
|
"reward_std": 0.35956617817282677, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/cosine_scaled_reward": 0.5514564663171768, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 1329.3214721679688, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.21208392637462176, |
|
"kl": 0.02642822265625, |
|
"learning_rate": 9.262541390765981e-07, |
|
"loss": 0.0319, |
|
"reward": 1.1303415894508362, |
|
"reward_std": 0.5117903053760529, |
|
"rewards/accuracy_reward": 0.6607143133878708, |
|
"rewards/cosine_scaled_reward": 0.4696272984147072, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 942.5982666015625, |
|
"epoch": 0.2696296296296296, |
|
"grad_norm": 0.24191683773068134, |
|
"kl": 0.035491943359375, |
|
"learning_rate": 9.236745922441589e-07, |
|
"loss": 0.0795, |
|
"reward": 1.430460512638092, |
|
"reward_std": 0.3666737973690033, |
|
"rewards/accuracy_reward": 0.7812500298023224, |
|
"rewards/cosine_scaled_reward": 0.6492104828357697, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 1154.8437957763672, |
|
"epoch": 0.2725925925925926, |
|
"grad_norm": 0.19725864085814365, |
|
"kl": 0.030853271484375, |
|
"learning_rate": 9.210548751957133e-07, |
|
"loss": -0.0129, |
|
"reward": 1.3163889944553375, |
|
"reward_std": 0.5869083181023598, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.5753175765275955, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 1329.2678833007812, |
|
"epoch": 0.27555555555555555, |
|
"grad_norm": 0.1823845016827509, |
|
"kl": 0.03363037109375, |
|
"learning_rate": 9.183952695522273e-07, |
|
"loss": 0.0291, |
|
"reward": 1.2691433429718018, |
|
"reward_std": 0.4903425797820091, |
|
"rewards/accuracy_reward": 0.7098214775323868, |
|
"rewards/cosine_scaled_reward": 0.5593218728899956, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 1625.6563262939453, |
|
"epoch": 0.2785185185185185, |
|
"grad_norm": 0.2494826179007093, |
|
"kl": 0.03369140625, |
|
"learning_rate": 9.156960612227125e-07, |
|
"loss": 0.0871, |
|
"reward": 1.1840568780899048, |
|
"reward_std": 0.4199274815618992, |
|
"rewards/accuracy_reward": 0.6696428954601288, |
|
"rewards/cosine_scaled_reward": 0.5144139900803566, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 1420.6339721679688, |
|
"epoch": 0.2814814814814815, |
|
"grad_norm": 0.2102867426975808, |
|
"kl": 0.03350830078125, |
|
"learning_rate": 9.129575403734897e-07, |
|
"loss": 0.0645, |
|
"reward": 1.2297062426805496, |
|
"reward_std": 0.47686289995908737, |
|
"rewards/accuracy_reward": 0.6964286118745804, |
|
"rewards/cosine_scaled_reward": 0.5332776308059692, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 1161.6607818603516, |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 0.19499198339251772, |
|
"kl": 0.041473388671875, |
|
"learning_rate": 9.101800013969962e-07, |
|
"loss": 0.0625, |
|
"reward": 1.1435084491968155, |
|
"reward_std": 0.4009154736995697, |
|
"rewards/accuracy_reward": 0.647321455180645, |
|
"rewards/cosine_scaled_reward": 0.496186975389719, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1372.5357818603516, |
|
"epoch": 0.2874074074074074, |
|
"grad_norm": 0.19600482088111124, |
|
"kl": 0.039581298828125, |
|
"learning_rate": 9.07363742880139e-07, |
|
"loss": 0.0203, |
|
"reward": 1.3975748717784882, |
|
"reward_std": 0.44245097786188126, |
|
"rewards/accuracy_reward": 0.76339291036129, |
|
"rewards/cosine_scaled_reward": 0.634181946516037, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1010.8259429931641, |
|
"epoch": 0.2903703703703704, |
|
"grad_norm": 0.20258954456169906, |
|
"kl": 0.04852294921875, |
|
"learning_rate": 9.045090675721959e-07, |
|
"loss": 0.0426, |
|
"reward": 1.2528180032968521, |
|
"reward_std": 0.5208085626363754, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.5519250854849815, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1133.808090209961, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.24010785767735876, |
|
"kl": 0.0523681640625, |
|
"learning_rate": 9.016162823522701e-07, |
|
"loss": -0.0414, |
|
"reward": 1.1999054104089737, |
|
"reward_std": 0.40023650601506233, |
|
"rewards/accuracy_reward": 0.6830357313156128, |
|
"rewards/cosine_scaled_reward": 0.5168696194887161, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 868.0268249511719, |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.2556790055127662, |
|
"kl": 0.05438232421875, |
|
"learning_rate": 8.986856981963004e-07, |
|
"loss": 0.0141, |
|
"reward": 1.2974393367767334, |
|
"reward_std": 0.42083971202373505, |
|
"rewards/accuracy_reward": 0.7098214775323868, |
|
"rewards/cosine_scaled_reward": 0.5876179337501526, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 1285.05810546875, |
|
"epoch": 0.2992592592592593, |
|
"grad_norm": 0.2672184140592102, |
|
"kl": 0.0479736328125, |
|
"learning_rate": 8.957176301436312e-07, |
|
"loss": 0.0897, |
|
"reward": 1.2107618898153305, |
|
"reward_std": 0.4981778487563133, |
|
"rewards/accuracy_reward": 0.6919643133878708, |
|
"rewards/cosine_scaled_reward": 0.5187975913286209, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 1239.4420013427734, |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 0.26029036344061485, |
|
"kl": 0.051483154296875, |
|
"learning_rate": 8.927123972631457e-07, |
|
"loss": 0.135, |
|
"reward": 1.3805895149707794, |
|
"reward_std": 0.4160504639148712, |
|
"rewards/accuracy_reward": 0.7589286118745804, |
|
"rewards/cosine_scaled_reward": 0.6216609328985214, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 779.2232666015625, |
|
"epoch": 0.30518518518518517, |
|
"grad_norm": 0.27072308594934286, |
|
"kl": 0.06658935546875, |
|
"learning_rate": 8.896703226189656e-07, |
|
"loss": 0.0549, |
|
"reward": 1.3102031350135803, |
|
"reward_std": 0.4214232973754406, |
|
"rewards/accuracy_reward": 0.7232143059372902, |
|
"rewards/cosine_scaled_reward": 0.5869888141751289, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 1419.3795471191406, |
|
"epoch": 0.30814814814814817, |
|
"grad_norm": 0.24404839226709768, |
|
"kl": 0.05548095703125, |
|
"learning_rate": 8.865917332357217e-07, |
|
"loss": -0.0445, |
|
"reward": 1.1191436797380447, |
|
"reward_std": 0.5001409500837326, |
|
"rewards/accuracy_reward": 0.6383928805589676, |
|
"rewards/cosine_scaled_reward": 0.48075081408023834, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 1380.4598846435547, |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 0.22891576288171045, |
|
"kl": 0.0596923828125, |
|
"learning_rate": 8.834769600633986e-07, |
|
"loss": 0.0395, |
|
"reward": 1.190809726715088, |
|
"reward_std": 0.4635982885956764, |
|
"rewards/accuracy_reward": 0.6607143133878708, |
|
"rewards/cosine_scaled_reward": 0.5300954133272171, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 1261.1250610351562, |
|
"epoch": 0.31407407407407406, |
|
"grad_norm": 0.2543103499852788, |
|
"kl": 0.072265625, |
|
"learning_rate": 8.803263379417572e-07, |
|
"loss": 0.0503, |
|
"reward": 1.297232449054718, |
|
"reward_std": 0.4232187941670418, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5874110013246536, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 1787.1965026855469, |
|
"epoch": 0.31703703703703706, |
|
"grad_norm": 0.16701870028955876, |
|
"kl": 0.042205810546875, |
|
"learning_rate": 8.771402055643391e-07, |
|
"loss": 0.0263, |
|
"reward": 0.8708714246749878, |
|
"reward_std": 0.5340973809361458, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/cosine_scaled_reward": 0.3351571261882782, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 1069.0223770141602, |
|
"epoch": 0.32, |
|
"grad_norm": 0.245970485062095, |
|
"kl": 0.0693359375, |
|
"learning_rate": 8.73918905442058e-07, |
|
"loss": 0.1147, |
|
"reward": 1.1969991326332092, |
|
"reward_std": 0.3443680591881275, |
|
"rewards/accuracy_reward": 0.6562500298023224, |
|
"rewards/cosine_scaled_reward": 0.5407490879297256, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 1027.7768249511719, |
|
"epoch": 0.32296296296296295, |
|
"grad_norm": 0.2561486876196046, |
|
"kl": 0.0853271484375, |
|
"learning_rate": 8.706627838663782e-07, |
|
"loss": 0.0559, |
|
"reward": 1.1508228331804276, |
|
"reward_std": 0.5210666060447693, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5168942138552666, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 1099.7411499023438, |
|
"epoch": 0.32592592592592595, |
|
"grad_norm": 0.25517403843187497, |
|
"kl": 0.06573486328125, |
|
"learning_rate": 8.673721908720884e-07, |
|
"loss": 0.0741, |
|
"reward": 1.2350784838199615, |
|
"reward_std": 0.4044996239244938, |
|
"rewards/accuracy_reward": 0.6875000447034836, |
|
"rewards/cosine_scaled_reward": 0.5475784614682198, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 1189.6741638183594, |
|
"epoch": 0.3288888888888889, |
|
"grad_norm": 0.23395331806651343, |
|
"kl": 0.0775146484375, |
|
"learning_rate": 8.640474801996732e-07, |
|
"loss": 0.0665, |
|
"reward": 1.3876985013484955, |
|
"reward_std": 0.3757442235946655, |
|
"rewards/accuracy_reward": 0.7678571790456772, |
|
"rewards/cosine_scaled_reward": 0.6198412925004959, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 977.1652145385742, |
|
"epoch": 0.33185185185185184, |
|
"grad_norm": 0.28043157986328343, |
|
"kl": 0.088134765625, |
|
"learning_rate": 8.606890092572861e-07, |
|
"loss": -0.0177, |
|
"reward": 1.1703044474124908, |
|
"reward_std": 0.5287574678659439, |
|
"rewards/accuracy_reward": 0.6428571790456772, |
|
"rewards/cosine_scaled_reward": 0.5274473056197166, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 1058.357162475586, |
|
"epoch": 0.3348148148148148, |
|
"grad_norm": 0.28507497649017294, |
|
"kl": 0.08673095703125, |
|
"learning_rate": 8.572971390823266e-07, |
|
"loss": 0.0739, |
|
"reward": 1.270705059170723, |
|
"reward_std": 0.5322270393371582, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.5742765069007874, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1392.9152374267578, |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 0.22063315207690926, |
|
"kl": 0.0809326171875, |
|
"learning_rate": 8.538722343026302e-07, |
|
"loss": 0.0446, |
|
"reward": 1.007172241806984, |
|
"reward_std": 0.37207865715026855, |
|
"rewards/accuracy_reward": 0.5625000223517418, |
|
"rewards/cosine_scaled_reward": 0.44467223435640335, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 859.0893211364746, |
|
"epoch": 0.34074074074074073, |
|
"grad_norm": 0.300236056528611, |
|
"kl": 0.1016845703125, |
|
"learning_rate": 8.50414663097269e-07, |
|
"loss": 0.0039, |
|
"reward": 1.4534207880496979, |
|
"reward_std": 0.5292069166898727, |
|
"rewards/accuracy_reward": 0.776785746216774, |
|
"rewards/cosine_scaled_reward": 0.6766350567340851, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 1658.6161651611328, |
|
"epoch": 0.3437037037037037, |
|
"grad_norm": 0.21810828567849236, |
|
"kl": 0.0694580078125, |
|
"learning_rate": 8.46924797156974e-07, |
|
"loss": 0.0056, |
|
"reward": 1.076892763376236, |
|
"reward_std": 0.5404257103800774, |
|
"rewards/accuracy_reward": 0.6116071790456772, |
|
"rewards/cosine_scaled_reward": 0.46528560668230057, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 1132.7455749511719, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 0.2561231185493826, |
|
"kl": 0.0867919921875, |
|
"learning_rate": 8.434030116441765e-07, |
|
"loss": 0.0196, |
|
"reward": 1.271399825811386, |
|
"reward_std": 0.5448361113667488, |
|
"rewards/accuracy_reward": 0.705357164144516, |
|
"rewards/cosine_scaled_reward": 0.5660426765680313, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 1571.7902526855469, |
|
"epoch": 0.3496296296296296, |
|
"grad_norm": 0.23889886690383044, |
|
"kl": 0.07391357421875, |
|
"learning_rate": 8.39849685152679e-07, |
|
"loss": 0.0414, |
|
"reward": 1.0188361555337906, |
|
"reward_std": 0.5471197664737701, |
|
"rewards/accuracy_reward": 0.584821455180645, |
|
"rewards/cosine_scaled_reward": 0.4340147264301777, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 938.7991333007812, |
|
"epoch": 0.35259259259259257, |
|
"grad_norm": 0.26749074498213615, |
|
"kl": 0.0892333984375, |
|
"learning_rate": 8.36265199666956e-07, |
|
"loss": 0.0489, |
|
"reward": 1.4469529390335083, |
|
"reward_std": 0.43392040487378836, |
|
"rewards/accuracy_reward": 0.7812500298023224, |
|
"rewards/cosine_scaled_reward": 0.6657029464840889, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 842.6027221679688, |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.2779316922692756, |
|
"kl": 0.1029052734375, |
|
"learning_rate": 8.326499405210902e-07, |
|
"loss": 0.0706, |
|
"reward": 1.3225018680095673, |
|
"reward_std": 0.5245833843946457, |
|
"rewards/accuracy_reward": 0.7008928954601288, |
|
"rewards/cosine_scaled_reward": 0.6216090172529221, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 1482.9911193847656, |
|
"epoch": 0.3585185185185185, |
|
"grad_norm": 0.20900230828152339, |
|
"kl": 0.06805419921875, |
|
"learning_rate": 8.290042963573488e-07, |
|
"loss": -0.0292, |
|
"reward": 1.2458033114671707, |
|
"reward_std": 0.586730495095253, |
|
"rewards/accuracy_reward": 0.6875000447034836, |
|
"rewards/cosine_scaled_reward": 0.5583032667636871, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 899.1562805175781, |
|
"epoch": 0.36148148148148146, |
|
"grad_norm": 0.27477236439012975, |
|
"kl": 0.1024169921875, |
|
"learning_rate": 8.25328659084405e-07, |
|
"loss": 0.0473, |
|
"reward": 1.3379029631614685, |
|
"reward_std": 0.3929406702518463, |
|
"rewards/accuracy_reward": 0.7276785969734192, |
|
"rewards/cosine_scaled_reward": 0.6102243810892105, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 1149.5938262939453, |
|
"epoch": 0.36444444444444446, |
|
"grad_norm": 0.26020835723478647, |
|
"kl": 0.082275390625, |
|
"learning_rate": 8.216234238352065e-07, |
|
"loss": 0.0949, |
|
"reward": 1.515016108751297, |
|
"reward_std": 0.453274130821228, |
|
"rewards/accuracy_reward": 0.808035746216774, |
|
"rewards/cosine_scaled_reward": 0.7069803923368454, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1142.1786193847656, |
|
"epoch": 0.3674074074074074, |
|
"grad_norm": 0.3235095014491872, |
|
"kl": 0.11279296875, |
|
"learning_rate": 8.178889889244996e-07, |
|
"loss": 0.0702, |
|
"reward": 1.3975563943386078, |
|
"reward_std": 0.2819400802254677, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.6475563049316406, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 876.8393249511719, |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.27873510040561517, |
|
"kl": 0.1097412109375, |
|
"learning_rate": 8.141257558060092e-07, |
|
"loss": -0.005, |
|
"reward": 1.4616929292678833, |
|
"reward_std": 0.5415130406618118, |
|
"rewards/accuracy_reward": 0.7678571790456772, |
|
"rewards/cosine_scaled_reward": 0.6938357651233673, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1556.3661499023438, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.22490676274021995, |
|
"kl": 0.06610107421875, |
|
"learning_rate": 8.103341290292833e-07, |
|
"loss": -0.0147, |
|
"reward": 0.9707639068365097, |
|
"reward_std": 0.667090579867363, |
|
"rewards/accuracy_reward": 0.5714285895228386, |
|
"rewards/cosine_scaled_reward": 0.39933526888489723, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1085.2723693847656, |
|
"epoch": 0.3762962962962963, |
|
"grad_norm": 0.2609771942189481, |
|
"kl": 0.087646484375, |
|
"learning_rate": 8.065145161962021e-07, |
|
"loss": 0.0467, |
|
"reward": 1.2797034680843353, |
|
"reward_std": 0.5504499524831772, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/cosine_scaled_reward": 0.5564891993999481, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 1202.2411193847656, |
|
"epoch": 0.37925925925925924, |
|
"grad_norm": 0.23147646598137125, |
|
"kl": 0.08319091796875, |
|
"learning_rate": 8.02667327917163e-07, |
|
"loss": -0.0018, |
|
"reward": 1.2336037755012512, |
|
"reward_std": 0.5526604950428009, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.5371751636266708, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 679.9955673217773, |
|
"epoch": 0.38222222222222224, |
|
"grad_norm": 0.6332260844439259, |
|
"kl": 0.1307373046875, |
|
"learning_rate": 7.987929777669372e-07, |
|
"loss": -0.0718, |
|
"reward": 1.3172721862792969, |
|
"reward_std": 0.49390799552202225, |
|
"rewards/accuracy_reward": 0.6964286118745804, |
|
"rewards/cosine_scaled_reward": 0.6208435744047165, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 1333.0089721679688, |
|
"epoch": 0.3851851851851852, |
|
"grad_norm": 0.2913669404852735, |
|
"kl": 0.0980224609375, |
|
"learning_rate": 7.948918822402123e-07, |
|
"loss": 0.0561, |
|
"reward": 1.1632278561592102, |
|
"reward_std": 0.45225123316049576, |
|
"rewards/accuracy_reward": 0.6383928805589676, |
|
"rewards/cosine_scaled_reward": 0.5248349532485008, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 1337.575942993164, |
|
"epoch": 0.38814814814814813, |
|
"grad_norm": 0.2186167579316726, |
|
"kl": 0.08392333984375, |
|
"learning_rate": 7.909644607068174e-07, |
|
"loss": 0.001, |
|
"reward": 1.1138341426849365, |
|
"reward_std": 0.5587991923093796, |
|
"rewards/accuracy_reward": 0.6339286118745804, |
|
"rewards/cosine_scaled_reward": 0.47990551590919495, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 1420.0491790771484, |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 0.2919760236776904, |
|
"kl": 0.09759521484375, |
|
"learning_rate": 7.870111353666414e-07, |
|
"loss": 0.0648, |
|
"reward": 1.2764753997325897, |
|
"reward_std": 0.5266730934381485, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.5711181536316872, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 1444.946517944336, |
|
"epoch": 0.3940740740740741, |
|
"grad_norm": 0.25522552398542675, |
|
"kl": 0.08349609375, |
|
"learning_rate": 7.830323312042464e-07, |
|
"loss": 0.0683, |
|
"reward": 1.188013032078743, |
|
"reward_std": 0.37928835675120354, |
|
"rewards/accuracy_reward": 0.6696428805589676, |
|
"rewards/cosine_scaled_reward": 0.5183701142668724, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1024.1027145385742, |
|
"epoch": 0.397037037037037, |
|
"grad_norm": 0.26298616639534006, |
|
"kl": 0.10247802734375, |
|
"learning_rate": 7.790284759431809e-07, |
|
"loss": 0.0189, |
|
"reward": 1.3900260627269745, |
|
"reward_std": 0.30807338282465935, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.6400260329246521, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 1421.58935546875, |
|
"epoch": 0.4, |
|
"grad_norm": 0.24721365088048822, |
|
"kl": 0.07550048828125, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0589, |
|
"reward": 1.1417311877012253, |
|
"reward_std": 0.33604446426033974, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5078025981783867, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1269.888442993164, |
|
"epoch": 0.40296296296296297, |
|
"grad_norm": 0.27976499664133275, |
|
"kl": 0.104248046875, |
|
"learning_rate": 7.709473364379949e-07, |
|
"loss": 0.0435, |
|
"reward": 1.158977895975113, |
|
"reward_std": 0.3557581529021263, |
|
"rewards/accuracy_reward": 0.6473214626312256, |
|
"rewards/cosine_scaled_reward": 0.5116564705967903, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1240.1295318603516, |
|
"epoch": 0.4059259259259259, |
|
"grad_norm": 0.26528035666816113, |
|
"kl": 0.117431640625, |
|
"learning_rate": 7.668709209206391e-07, |
|
"loss": -0.0069, |
|
"reward": 1.4490948617458344, |
|
"reward_std": 0.47412872314453125, |
|
"rewards/accuracy_reward": 0.7812500298023224, |
|
"rewards/cosine_scaled_reward": 0.6678448840975761, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1057.0848693847656, |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 0.2625748064905278, |
|
"kl": 0.10546875, |
|
"learning_rate": 7.627711916647531e-07, |
|
"loss": 0.0357, |
|
"reward": 1.233490526676178, |
|
"reward_std": 0.48493514209985733, |
|
"rewards/accuracy_reward": 0.683035746216774, |
|
"rewards/cosine_scaled_reward": 0.5504548028111458, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 1540.4732666015625, |
|
"epoch": 0.41185185185185186, |
|
"grad_norm": 0.3012629488379583, |
|
"kl": 0.08453369140625, |
|
"learning_rate": 7.586485893933972e-07, |
|
"loss": 0.056, |
|
"reward": 1.2701046466827393, |
|
"reward_std": 0.5633516684174538, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.5647474825382233, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 1508.0804138183594, |
|
"epoch": 0.4148148148148148, |
|
"grad_norm": 0.24916923862969306, |
|
"kl": 0.10467529296875, |
|
"learning_rate": 7.545035572884928e-07, |
|
"loss": -0.0049, |
|
"reward": 1.271433025598526, |
|
"reward_std": 0.4527568593621254, |
|
"rewards/accuracy_reward": 0.7232143133878708, |
|
"rewards/cosine_scaled_reward": 0.5482186861336231, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 1373.6786193847656, |
|
"epoch": 0.4177777777777778, |
|
"grad_norm": 0.26167485604048296, |
|
"kl": 0.1085205078125, |
|
"learning_rate": 7.503365409431801e-07, |
|
"loss": 0.0049, |
|
"reward": 1.2524654418230057, |
|
"reward_std": 0.47800225764513016, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5649654343724251, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 902.8259124755859, |
|
"epoch": 0.42074074074074075, |
|
"grad_norm": 0.36269394404753696, |
|
"kl": 0.154296875, |
|
"learning_rate": 7.46147988313917e-07, |
|
"loss": 0.0854, |
|
"reward": 1.6476789712905884, |
|
"reward_std": 0.4029630944132805, |
|
"rewards/accuracy_reward": 0.8616071790456772, |
|
"rewards/cosine_scaled_reward": 0.7860718369483948, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 1099.325942993164, |
|
"epoch": 0.4237037037037037, |
|
"grad_norm": 0.33429217324642896, |
|
"kl": 0.1253662109375, |
|
"learning_rate": 7.419383496723229e-07, |
|
"loss": 0.0115, |
|
"reward": 1.1484860479831696, |
|
"reward_std": 0.49667520076036453, |
|
"rewards/accuracy_reward": 0.6383928805589676, |
|
"rewards/cosine_scaled_reward": 0.510093130171299, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 1838.6786499023438, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.25993606256207835, |
|
"kl": 0.08837890625, |
|
"learning_rate": 7.377080775567751e-07, |
|
"loss": 0.1217, |
|
"reward": 0.9903404861688614, |
|
"reward_std": 0.387987844645977, |
|
"rewards/accuracy_reward": 0.558035746216774, |
|
"rewards/cosine_scaled_reward": 0.4323047176003456, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 953.6339797973633, |
|
"epoch": 0.42962962962962964, |
|
"grad_norm": 0.2719750141796096, |
|
"kl": 0.1390380859375, |
|
"learning_rate": 7.334576267237599e-07, |
|
"loss": 0.0446, |
|
"reward": 1.3718172013759613, |
|
"reward_std": 0.475243978202343, |
|
"rewards/accuracy_reward": 0.7321428805589676, |
|
"rewards/cosine_scaled_reward": 0.6396742761135101, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 1215.2634582519531, |
|
"epoch": 0.4325925925925926, |
|
"grad_norm": 0.27796212001837417, |
|
"kl": 0.111572265625, |
|
"learning_rate": 7.291874540989869e-07, |
|
"loss": -0.0992, |
|
"reward": 1.2015317529439926, |
|
"reward_std": 0.618925541639328, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/cosine_scaled_reward": 0.5408174768090248, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 1392.4286499023438, |
|
"epoch": 0.43555555555555553, |
|
"grad_norm": 0.28571629899742523, |
|
"kl": 0.123046875, |
|
"learning_rate": 7.248980187282679e-07, |
|
"loss": 0.0525, |
|
"reward": 1.1790579408407211, |
|
"reward_std": 0.47974304109811783, |
|
"rewards/accuracy_reward": 0.6607142984867096, |
|
"rewards/cosine_scaled_reward": 0.5183436721563339, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1062.90185546875, |
|
"epoch": 0.43851851851851853, |
|
"grad_norm": 0.29903621450951867, |
|
"kl": 0.1376953125, |
|
"learning_rate": 7.205897817281707e-07, |
|
"loss": 0.0587, |
|
"reward": 1.2066510319709778, |
|
"reward_std": 0.550245389342308, |
|
"rewards/accuracy_reward": 0.6607143133878708, |
|
"rewards/cosine_scaled_reward": 0.545936681330204, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 781.2277069091797, |
|
"epoch": 0.4414814814814815, |
|
"grad_norm": 0.3594108148854528, |
|
"kl": 0.173583984375, |
|
"learning_rate": 7.162632062364482e-07, |
|
"loss": 0.0419, |
|
"reward": 1.5628540217876434, |
|
"reward_std": 0.3035361301153898, |
|
"rewards/accuracy_reward": 0.8214286118745804, |
|
"rewards/cosine_scaled_reward": 0.741425409913063, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 1027.4375610351562, |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.40168570307081203, |
|
"kl": 0.179443359375, |
|
"learning_rate": 7.119187573622503e-07, |
|
"loss": 0.0299, |
|
"reward": 1.208786502480507, |
|
"reward_std": 0.4775058552622795, |
|
"rewards/accuracy_reward": 0.647321455180645, |
|
"rewards/cosine_scaled_reward": 0.5614650174975395, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 1068.450912475586, |
|
"epoch": 0.4474074074074074, |
|
"grad_norm": 0.36202416334148413, |
|
"kl": 0.154541015625, |
|
"learning_rate": 7.075569021361258e-07, |
|
"loss": -0.0604, |
|
"reward": 1.1638824492692947, |
|
"reward_std": 0.6943438649177551, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5299538299441338, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 1558.2411499023438, |
|
"epoch": 0.45037037037037037, |
|
"grad_norm": 0.3270636332627284, |
|
"kl": 0.1575927734375, |
|
"learning_rate": 7.031781094598147e-07, |
|
"loss": 0.0083, |
|
"reward": 1.0367525964975357, |
|
"reward_std": 0.5159935727715492, |
|
"rewards/accuracy_reward": 0.566964328289032, |
|
"rewards/cosine_scaled_reward": 0.4697883054614067, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 1267.9375457763672, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.2601830199231623, |
|
"kl": 0.142333984375, |
|
"learning_rate": 6.987828500558422e-07, |
|
"loss": 0.0311, |
|
"reward": 1.200500175356865, |
|
"reward_std": 0.4817045107483864, |
|
"rewards/accuracy_reward": 0.6562500298023224, |
|
"rewards/cosine_scaled_reward": 0.5442501083016396, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1155.1473693847656, |
|
"epoch": 0.4562962962962963, |
|
"grad_norm": 0.3657929469140216, |
|
"kl": 0.166748046875, |
|
"learning_rate": 6.943715964169153e-07, |
|
"loss": 0.0741, |
|
"reward": 1.30518639087677, |
|
"reward_std": 0.4314998611807823, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5953649058938026, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 1294.2500305175781, |
|
"epoch": 0.45925925925925926, |
|
"grad_norm": 0.3025845364980038, |
|
"kl": 0.1407470703125, |
|
"learning_rate": 6.899448227551302e-07, |
|
"loss": 0.0423, |
|
"reward": 1.4187033772468567, |
|
"reward_std": 0.47818124294281006, |
|
"rewards/accuracy_reward": 0.7589286267757416, |
|
"rewards/cosine_scaled_reward": 0.6597748026251793, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 1628.3482971191406, |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 0.3033982983842964, |
|
"kl": 0.1275634765625, |
|
"learning_rate": 6.85503004950993e-07, |
|
"loss": -0.0077, |
|
"reward": 1.1377353817224503, |
|
"reward_std": 0.5007912814617157, |
|
"rewards/accuracy_reward": 0.6428571790456772, |
|
"rewards/cosine_scaled_reward": 0.4948781877756119, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1181.90629196167, |
|
"epoch": 0.4651851851851852, |
|
"grad_norm": 0.31424031764619936, |
|
"kl": 0.1624755859375, |
|
"learning_rate": 6.810466205022635e-07, |
|
"loss": 0.0409, |
|
"reward": 1.3390273749828339, |
|
"reward_std": 0.41271302849054337, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.5890273749828339, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 1074.7947082519531, |
|
"epoch": 0.46814814814814815, |
|
"grad_norm": 0.35310245273581153, |
|
"kl": 0.1787109375, |
|
"learning_rate": 6.765761484726232e-07, |
|
"loss": -0.0245, |
|
"reward": 1.2675886452198029, |
|
"reward_std": 0.5314782559871674, |
|
"rewards/accuracy_reward": 0.683035746216774, |
|
"rewards/cosine_scaled_reward": 0.5845528990030289, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 1470.8170166015625, |
|
"epoch": 0.4711111111111111, |
|
"grad_norm": 0.31730711857033983, |
|
"kl": 0.12823486328125, |
|
"learning_rate": 6.720920694401765e-07, |
|
"loss": 0.011, |
|
"reward": 1.2019407004117966, |
|
"reward_std": 0.5537143424153328, |
|
"rewards/accuracy_reward": 0.6562500447034836, |
|
"rewards/cosine_scaled_reward": 0.5456906482577324, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 1101.3482666015625, |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 0.3749343248268511, |
|
"kl": 0.1827392578125, |
|
"learning_rate": 6.675948654457873e-07, |
|
"loss": 0.0726, |
|
"reward": 1.254590556025505, |
|
"reward_std": 0.40981949865818024, |
|
"rewards/accuracy_reward": 0.6696428805589676, |
|
"rewards/cosine_scaled_reward": 0.5849476233124733, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 864.892894744873, |
|
"epoch": 0.47703703703703704, |
|
"grad_norm": 0.4140143134046075, |
|
"kl": 0.224365234375, |
|
"learning_rate": 6.6308501994126e-07, |
|
"loss": 0.07, |
|
"reward": 1.4429502189159393, |
|
"reward_std": 0.4082343354821205, |
|
"rewards/accuracy_reward": 0.7678571790456772, |
|
"rewards/cosine_scaled_reward": 0.6750930473208427, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1032.7545013427734, |
|
"epoch": 0.48, |
|
"grad_norm": 0.38927414518235437, |
|
"kl": 0.18310546875, |
|
"learning_rate": 6.585630177373679e-07, |
|
"loss": 0.0317, |
|
"reward": 1.3784838616847992, |
|
"reward_std": 0.6301179528236389, |
|
"rewards/accuracy_reward": 0.736607164144516, |
|
"rewards/cosine_scaled_reward": 0.641876682639122, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 1148.0491790771484, |
|
"epoch": 0.482962962962963, |
|
"grad_norm": 0.37818983157554104, |
|
"kl": 0.20947265625, |
|
"learning_rate": 6.540293449517364e-07, |
|
"loss": 0.1298, |
|
"reward": 1.0715374201536179, |
|
"reward_std": 0.560625359416008, |
|
"rewards/accuracy_reward": 0.6026786044239998, |
|
"rewards/cosine_scaled_reward": 0.4688587933778763, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1020.1384429931641, |
|
"epoch": 0.48592592592592593, |
|
"grad_norm": 0.5055960507298748, |
|
"kl": 0.2374267578125, |
|
"learning_rate": 6.494844889565838e-07, |
|
"loss": -0.0305, |
|
"reward": 1.3220538794994354, |
|
"reward_std": 0.5499390736222267, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.6211609840393066, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 1183.3348693847656, |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 0.43271458690777626, |
|
"kl": 0.197998046875, |
|
"learning_rate": 6.449289383263299e-07, |
|
"loss": 0.0018, |
|
"reward": 1.2181346118450165, |
|
"reward_std": 0.5314186587929726, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/cosine_scaled_reward": 0.5574202537536621, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1187.1339721679688, |
|
"epoch": 0.4918518518518519, |
|
"grad_norm": 0.34390574748275693, |
|
"kl": 0.18408203125, |
|
"learning_rate": 6.403631827850733e-07, |
|
"loss": -0.001, |
|
"reward": 1.299418330192566, |
|
"reward_std": 0.4987459257245064, |
|
"rewards/accuracy_reward": 0.7098214775323868, |
|
"rewards/cosine_scaled_reward": 0.5895968675613403, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 1049.1652221679688, |
|
"epoch": 0.4948148148148148, |
|
"grad_norm": 0.43196078764273776, |
|
"kl": 0.233154296875, |
|
"learning_rate": 6.357877131539459e-07, |
|
"loss": -0.0543, |
|
"reward": 1.2178914546966553, |
|
"reward_std": 0.6870964467525482, |
|
"rewards/accuracy_reward": 0.651785746216774, |
|
"rewards/cosine_scaled_reward": 0.5661056637763977, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 889.2053985595703, |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 0.5031433742478459, |
|
"kl": 0.280517578125, |
|
"learning_rate": 6.312030212983492e-07, |
|
"loss": 0.0753, |
|
"reward": 1.401370495557785, |
|
"reward_std": 0.5911066308617592, |
|
"rewards/accuracy_reward": 0.736607164144516, |
|
"rewards/cosine_scaled_reward": 0.6647634506225586, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 947.0134506225586, |
|
"epoch": 0.5007407407407407, |
|
"grad_norm": 0.33802955155948805, |
|
"kl": 0.206787109375, |
|
"learning_rate": 6.266096000750794e-07, |
|
"loss": -0.0208, |
|
"reward": 1.2584110498428345, |
|
"reward_std": 0.5075518116354942, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.5798396170139313, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1148.357192993164, |
|
"epoch": 0.5037037037037037, |
|
"grad_norm": 0.3869037274705203, |
|
"kl": 0.21875, |
|
"learning_rate": 6.220079432793434e-07, |
|
"loss": -0.06, |
|
"reward": 1.2284764647483826, |
|
"reward_std": 0.4915821775794029, |
|
"rewards/accuracy_reward": 0.674107164144516, |
|
"rewards/cosine_scaled_reward": 0.5543693378567696, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1371.2634582519531, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.3786413188449613, |
|
"kl": 0.236572265625, |
|
"learning_rate": 6.173985455916767e-07, |
|
"loss": 0.0856, |
|
"reward": 1.22401562333107, |
|
"reward_std": 0.44081344455480576, |
|
"rewards/accuracy_reward": 0.6696428954601288, |
|
"rewards/cosine_scaled_reward": 0.5543726608157158, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1635.5670166015625, |
|
"epoch": 0.5096296296296297, |
|
"grad_norm": 0.3542416380801021, |
|
"kl": 0.1768798828125, |
|
"learning_rate": 6.127819025247654e-07, |
|
"loss": -0.021, |
|
"reward": 0.9518559873104095, |
|
"reward_std": 0.6247570067644119, |
|
"rewards/accuracy_reward": 0.5446428805589676, |
|
"rewards/cosine_scaled_reward": 0.40721310302615166, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 696.8839721679688, |
|
"epoch": 0.5125925925925926, |
|
"grad_norm": 0.44284735926034635, |
|
"kl": 0.224609375, |
|
"learning_rate": 6.081585103701769e-07, |
|
"loss": 0.0629, |
|
"reward": 1.3010612279176712, |
|
"reward_std": 0.4532425180077553, |
|
"rewards/accuracy_reward": 0.7008928954601288, |
|
"rewards/cosine_scaled_reward": 0.6001683697104454, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 973.544677734375, |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 0.36960232543289, |
|
"kl": 0.21728515625, |
|
"learning_rate": 6.0352886614501e-07, |
|
"loss": 0.0488, |
|
"reward": 1.1753744930028915, |
|
"reward_std": 0.5731424987316132, |
|
"rewards/accuracy_reward": 0.642857164144516, |
|
"rewards/cosine_scaled_reward": 0.5325173661112785, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1180.0045013427734, |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.3761170899059127, |
|
"kl": 0.235595703125, |
|
"learning_rate": 5.988934675384635e-07, |
|
"loss": -0.0849, |
|
"reward": 1.1396174132823944, |
|
"reward_std": 0.6925529539585114, |
|
"rewards/accuracy_reward": 0.611607164144516, |
|
"rewards/cosine_scaled_reward": 0.5280102342367172, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 518.2946701049805, |
|
"epoch": 0.5214814814814814, |
|
"grad_norm": 0.5269257185741734, |
|
"kl": 0.3154296875, |
|
"learning_rate": 5.942528128583356e-07, |
|
"loss": 0.0309, |
|
"reward": 1.181448057293892, |
|
"reward_std": 0.5618361011147499, |
|
"rewards/accuracy_reward": 0.6205357387661934, |
|
"rewards/cosine_scaled_reward": 0.5609123036265373, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1084.8616485595703, |
|
"epoch": 0.5244444444444445, |
|
"grad_norm": 0.385748212876016, |
|
"kl": 0.22021484375, |
|
"learning_rate": 5.896074009774554e-07, |
|
"loss": 0.0062, |
|
"reward": 1.270677775144577, |
|
"reward_std": 0.5810166075825691, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5831777602434158, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 1041.5045013427734, |
|
"epoch": 0.5274074074074074, |
|
"grad_norm": 0.46881526757275843, |
|
"kl": 0.24072265625, |
|
"learning_rate": 5.849577312800529e-07, |
|
"loss": 0.0565, |
|
"reward": 1.2370425462722778, |
|
"reward_std": 0.6740812063217163, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/cosine_scaled_reward": 0.5763282403349876, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 728.0402221679688, |
|
"epoch": 0.5303703703703704, |
|
"grad_norm": 0.470067894180888, |
|
"kl": 0.25830078125, |
|
"learning_rate": 5.803043036080764e-07, |
|
"loss": 0.0428, |
|
"reward": 1.3547908663749695, |
|
"reward_std": 0.5294669568538666, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/cosine_scaled_reward": 0.6315765678882599, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1242.6027221679688, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.4591428088932079, |
|
"kl": 0.21435546875, |
|
"learning_rate": 5.756476182074582e-07, |
|
"loss": 0.0083, |
|
"reward": 1.2970826625823975, |
|
"reward_std": 0.5159892141819, |
|
"rewards/accuracy_reward": 0.7008928954601288, |
|
"rewards/cosine_scaled_reward": 0.5961898565292358, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 1337.1607666015625, |
|
"epoch": 0.5362962962962963, |
|
"grad_norm": 0.3661728892369155, |
|
"kl": 0.197021484375, |
|
"learning_rate": 5.709881756743379e-07, |
|
"loss": 0.0089, |
|
"reward": 1.1403344869613647, |
|
"reward_std": 0.5260699540376663, |
|
"rewards/accuracy_reward": 0.6294643059372902, |
|
"rewards/cosine_scaled_reward": 0.5108701921999454, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 919.8214836120605, |
|
"epoch": 0.5392592592592592, |
|
"grad_norm": 0.3895111477578375, |
|
"kl": 0.243408203125, |
|
"learning_rate": 5.663264769012486e-07, |
|
"loss": -0.068, |
|
"reward": 1.4501311480998993, |
|
"reward_std": 0.4405294507741928, |
|
"rewards/accuracy_reward": 0.7678571790456772, |
|
"rewards/cosine_scaled_reward": 0.6822739690542221, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1210.0982818603516, |
|
"epoch": 0.5422222222222223, |
|
"grad_norm": 0.3130809107538841, |
|
"kl": 0.18115234375, |
|
"learning_rate": 5.616630230232704e-07, |
|
"loss": 0.0747, |
|
"reward": 1.360123723745346, |
|
"reward_std": 0.4805947467684746, |
|
"rewards/accuracy_reward": 0.7321428954601288, |
|
"rewards/cosine_scaled_reward": 0.6279808431863785, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1449.9197082519531, |
|
"epoch": 0.5451851851851852, |
|
"grad_norm": 0.4137201964542162, |
|
"kl": 0.166748046875, |
|
"learning_rate": 5.569983153641579e-07, |
|
"loss": 0.0794, |
|
"reward": 1.2458688914775848, |
|
"reward_std": 0.4530060738325119, |
|
"rewards/accuracy_reward": 0.6741071715950966, |
|
"rewards/cosine_scaled_reward": 0.5717617124319077, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1297.200942993164, |
|
"epoch": 0.5481481481481482, |
|
"grad_norm": 0.40557122894404823, |
|
"kl": 0.19580078125, |
|
"learning_rate": 5.523328553824479e-07, |
|
"loss": 0.1124, |
|
"reward": 1.176156997680664, |
|
"reward_std": 0.5489060133695602, |
|
"rewards/accuracy_reward": 0.6651786118745804, |
|
"rewards/cosine_scaled_reward": 0.5109783783555031, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 1219.9420166015625, |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 0.4465073109866194, |
|
"kl": 0.194091796875, |
|
"learning_rate": 5.476671446175522e-07, |
|
"loss": 0.0069, |
|
"reward": 1.1364451944828033, |
|
"reward_std": 0.6480904817581177, |
|
"rewards/accuracy_reward": 0.6205357387661934, |
|
"rewards/cosine_scaled_reward": 0.5159093961119652, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 861.9553833007812, |
|
"epoch": 0.554074074074074, |
|
"grad_norm": 0.4575321909035033, |
|
"kl": 0.215087890625, |
|
"learning_rate": 5.43001684635842e-07, |
|
"loss": 0.0884, |
|
"reward": 1.6217327415943146, |
|
"reward_std": 0.38701897859573364, |
|
"rewards/accuracy_reward": 0.870535746216774, |
|
"rewards/cosine_scaled_reward": 0.7511969804763794, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1326.415267944336, |
|
"epoch": 0.557037037037037, |
|
"grad_norm": 0.39815832938233886, |
|
"kl": 0.18359375, |
|
"learning_rate": 5.383369769767296e-07, |
|
"loss": 0.0396, |
|
"reward": 1.2991815507411957, |
|
"reward_std": 0.5862655192613602, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5893600881099701, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1835.3527221679688, |
|
"epoch": 0.56, |
|
"grad_norm": 0.31642380883582527, |
|
"kl": 0.13916015625, |
|
"learning_rate": 5.336735230987514e-07, |
|
"loss": 0.079, |
|
"reward": 1.0685685127973557, |
|
"reward_std": 0.5348366796970367, |
|
"rewards/accuracy_reward": 0.6160714477300644, |
|
"rewards/cosine_scaled_reward": 0.45249706506729126, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 1354.2053985595703, |
|
"epoch": 0.562962962962963, |
|
"grad_norm": 0.37730924544753974, |
|
"kl": 0.196533203125, |
|
"learning_rate": 5.290118243256622e-07, |
|
"loss": -0.0122, |
|
"reward": 1.1858795583248138, |
|
"reward_std": 0.40029022842645645, |
|
"rewards/accuracy_reward": 0.6741071715950966, |
|
"rewards/cosine_scaled_reward": 0.5117723196744919, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1333.5045318603516, |
|
"epoch": 0.5659259259259259, |
|
"grad_norm": 0.3191966963841145, |
|
"kl": 0.1488037109375, |
|
"learning_rate": 5.243523817925418e-07, |
|
"loss": 0.0125, |
|
"reward": 1.1546119302511215, |
|
"reward_std": 0.41448020190000534, |
|
"rewards/accuracy_reward": 0.6383928954601288, |
|
"rewards/cosine_scaled_reward": 0.5162190869450569, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 2106.4197387695312, |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 0.28781845531452005, |
|
"kl": 0.11285400390625, |
|
"learning_rate": 5.196956963919237e-07, |
|
"loss": 0.1116, |
|
"reward": 1.1330502927303314, |
|
"reward_std": 0.55214674025774, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/cosine_scaled_reward": 0.4812645688652992, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1593.9866943359375, |
|
"epoch": 0.5718518518518518, |
|
"grad_norm": 0.2628953319964047, |
|
"kl": 0.1348876953125, |
|
"learning_rate": 5.150422687199471e-07, |
|
"loss": 0.0287, |
|
"reward": 1.3014012575149536, |
|
"reward_std": 0.47693130373954773, |
|
"rewards/accuracy_reward": 0.7232143133878708, |
|
"rewards/cosine_scaled_reward": 0.5781868472695351, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1275.2366790771484, |
|
"epoch": 0.5748148148148148, |
|
"grad_norm": 0.2994379062128194, |
|
"kl": 0.195068359375, |
|
"learning_rate": 5.103925990225448e-07, |
|
"loss": 0.02, |
|
"reward": 1.2638274729251862, |
|
"reward_std": 0.3140847235918045, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.5852559804916382, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 895.9241409301758, |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 0.46329426811686814, |
|
"kl": 0.22216796875, |
|
"learning_rate": 5.057471871416644e-07, |
|
"loss": 0.1082, |
|
"reward": 1.454516351222992, |
|
"reward_std": 0.36672039702534676, |
|
"rewards/accuracy_reward": 0.776785746216774, |
|
"rewards/cosine_scaled_reward": 0.6777307093143463, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 2187.2054748535156, |
|
"epoch": 0.5807407407407408, |
|
"grad_norm": 0.2967868062982687, |
|
"kl": 0.1002197265625, |
|
"learning_rate": 5.011065324615364e-07, |
|
"loss": 0.0748, |
|
"reward": 1.204501986503601, |
|
"reward_std": 0.5387090295553207, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/cosine_scaled_reward": 0.4991448149085045, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 1070.044677734375, |
|
"epoch": 0.5837037037037037, |
|
"grad_norm": 0.3698858062676789, |
|
"kl": 0.1868896484375, |
|
"learning_rate": 4.964711338549901e-07, |
|
"loss": 0.0298, |
|
"reward": 1.3357891142368317, |
|
"reward_std": 0.4346286430954933, |
|
"rewards/accuracy_reward": 0.7276786118745804, |
|
"rewards/cosine_scaled_reward": 0.6081104874610901, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 1416.6027526855469, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.30171989247018693, |
|
"kl": 0.1600341796875, |
|
"learning_rate": 4.918414896298229e-07, |
|
"loss": -0.04, |
|
"reward": 1.2265494465827942, |
|
"reward_std": 0.45234786719083786, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.5479779690504074, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 868.9062881469727, |
|
"epoch": 0.5896296296296296, |
|
"grad_norm": 0.42429433131812017, |
|
"kl": 0.213134765625, |
|
"learning_rate": 4.872180974752347e-07, |
|
"loss": -0.0113, |
|
"reward": 1.5499212741851807, |
|
"reward_std": 0.3130173161625862, |
|
"rewards/accuracy_reward": 0.8080357611179352, |
|
"rewards/cosine_scaled_reward": 0.7418854981660843, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1253.1206359863281, |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.3397983148086048, |
|
"kl": 0.179931640625, |
|
"learning_rate": 4.826014544083234e-07, |
|
"loss": 0.0647, |
|
"reward": 1.3181427121162415, |
|
"reward_std": 0.46541793644428253, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.6038569808006287, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1317.982192993164, |
|
"epoch": 0.5955555555555555, |
|
"grad_norm": 0.3438713760881616, |
|
"kl": 0.1781005859375, |
|
"learning_rate": 4.779920567206568e-07, |
|
"loss": -0.0592, |
|
"reward": 1.4068642556667328, |
|
"reward_std": 0.46310556679964066, |
|
"rewards/accuracy_reward": 0.7544643133878708, |
|
"rewards/cosine_scaled_reward": 0.6523999273777008, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1139.3973846435547, |
|
"epoch": 0.5985185185185186, |
|
"grad_norm": 0.4728872141325827, |
|
"kl": 0.205810546875, |
|
"learning_rate": 4.733903999249206e-07, |
|
"loss": 0.1675, |
|
"reward": 1.2444301843643188, |
|
"reward_std": 0.29916173219680786, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.5658586621284485, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1130.0357666015625, |
|
"epoch": 0.6014814814814815, |
|
"grad_norm": 0.4322501072269441, |
|
"kl": 0.182861328125, |
|
"learning_rate": 4.687969787016507e-07, |
|
"loss": 0.09, |
|
"reward": 1.3736966401338577, |
|
"reward_std": 0.38464444130659103, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.632625162601471, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1880.1072387695312, |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 0.25334618537099585, |
|
"kl": 0.105926513671875, |
|
"learning_rate": 4.642122868460542e-07, |
|
"loss": 0.0293, |
|
"reward": 1.2221907079219818, |
|
"reward_std": 0.4497520886361599, |
|
"rewards/accuracy_reward": 0.683035746216774, |
|
"rewards/cosine_scaled_reward": 0.5391549617052078, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1142.4330596923828, |
|
"epoch": 0.6074074074074074, |
|
"grad_norm": 0.41389278300030885, |
|
"kl": 0.2017822265625, |
|
"learning_rate": 4.596368172149268e-07, |
|
"loss": -0.0062, |
|
"reward": 1.0300216674804688, |
|
"reward_std": 0.5647179707884789, |
|
"rewards/accuracy_reward": 0.5669643133878708, |
|
"rewards/cosine_scaled_reward": 0.46305735409259796, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1438.7411193847656, |
|
"epoch": 0.6103703703703703, |
|
"grad_norm": 0.381533587277155, |
|
"kl": 0.18603515625, |
|
"learning_rate": 4.550710616736702e-07, |
|
"loss": 0.0456, |
|
"reward": 1.358052909374237, |
|
"reward_std": 0.3910771645605564, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.6169814988970757, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1475.946533203125, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 0.3649152348536222, |
|
"kl": 0.186767578125, |
|
"learning_rate": 4.505155110434162e-07, |
|
"loss": 0.0661, |
|
"reward": 1.3614622950553894, |
|
"reward_std": 0.43088656663894653, |
|
"rewards/accuracy_reward": 0.7366071790456772, |
|
"rewards/cosine_scaled_reward": 0.624855138361454, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1380.1295471191406, |
|
"epoch": 0.6162962962962963, |
|
"grad_norm": 0.36924439281566757, |
|
"kl": 0.17333984375, |
|
"learning_rate": 4.459706550482638e-07, |
|
"loss": -0.0171, |
|
"reward": 1.2316114753484726, |
|
"reward_std": 0.5087971612811089, |
|
"rewards/accuracy_reward": 0.6741071790456772, |
|
"rewards/cosine_scaled_reward": 0.5575042814016342, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1470.2590026855469, |
|
"epoch": 0.6192592592592593, |
|
"grad_norm": 0.3980563161448672, |
|
"kl": 0.168212890625, |
|
"learning_rate": 4.4143698226263207e-07, |
|
"loss": 0.0399, |
|
"reward": 1.2171460092067719, |
|
"reward_std": 0.6463766992092133, |
|
"rewards/accuracy_reward": 0.6696428805589676, |
|
"rewards/cosine_scaled_reward": 0.5475031360983849, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 899.7411193847656, |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 0.4032913849721601, |
|
"kl": 0.2008056640625, |
|
"learning_rate": 4.3691498005874007e-07, |
|
"loss": 0.0048, |
|
"reward": 1.4577341675758362, |
|
"reward_std": 0.539386659860611, |
|
"rewards/accuracy_reward": 0.7767857611179352, |
|
"rewards/cosine_scaled_reward": 0.6809485107660294, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1152.5804138183594, |
|
"epoch": 0.6251851851851852, |
|
"grad_norm": 0.35080155719175315, |
|
"kl": 0.173095703125, |
|
"learning_rate": 4.324051345542128e-07, |
|
"loss": 0.0312, |
|
"reward": 1.2082395255565643, |
|
"reward_std": 0.46680425107479095, |
|
"rewards/accuracy_reward": 0.6562500298023224, |
|
"rewards/cosine_scaled_reward": 0.5519895032048225, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1412.0268249511719, |
|
"epoch": 0.6281481481481481, |
|
"grad_norm": 0.36033713983943594, |
|
"kl": 0.186279296875, |
|
"learning_rate": 4.2790793055982354e-07, |
|
"loss": 0.1063, |
|
"reward": 1.1815235912799835, |
|
"reward_std": 0.4442535899579525, |
|
"rewards/accuracy_reward": 0.6562500149011612, |
|
"rewards/cosine_scaled_reward": 0.5252735912799835, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1060.7143249511719, |
|
"epoch": 0.6311111111111111, |
|
"grad_norm": 0.46894353747180056, |
|
"kl": 0.222900390625, |
|
"learning_rate": 4.234238515273768e-07, |
|
"loss": 0.0439, |
|
"reward": 1.5049341022968292, |
|
"reward_std": 0.44284530729055405, |
|
"rewards/accuracy_reward": 0.7991071790456772, |
|
"rewards/cosine_scaled_reward": 0.7058268785476685, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1348.3304290771484, |
|
"epoch": 0.6340740740740741, |
|
"grad_norm": 0.4055925514471472, |
|
"kl": 0.213134765625, |
|
"learning_rate": 4.189533794977367e-07, |
|
"loss": 0.118, |
|
"reward": 1.252614676952362, |
|
"reward_std": 0.442756824195385, |
|
"rewards/accuracy_reward": 0.6875000447034836, |
|
"rewards/cosine_scaled_reward": 0.5651145875453949, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1562.0982513427734, |
|
"epoch": 0.6370370370370371, |
|
"grad_norm": 0.3036559653599386, |
|
"kl": 0.1678466796875, |
|
"learning_rate": 4.14496995049007e-07, |
|
"loss": 0.0349, |
|
"reward": 1.2328214347362518, |
|
"reward_std": 0.5444767251610756, |
|
"rewards/accuracy_reward": 0.6741071790456772, |
|
"rewards/cosine_scaled_reward": 0.5587142258882523, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 1202.1116638183594, |
|
"epoch": 0.64, |
|
"grad_norm": 0.37347997620862683, |
|
"kl": 0.21240234375, |
|
"learning_rate": 4.100551772448697e-07, |
|
"loss": 0.1321, |
|
"reward": 1.2631460428237915, |
|
"reward_std": 0.43043725937604904, |
|
"rewards/accuracy_reward": 0.6919643133878708, |
|
"rewards/cosine_scaled_reward": 0.5711817443370819, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1593.05810546875, |
|
"epoch": 0.642962962962963, |
|
"grad_norm": 0.3224052157606133, |
|
"kl": 0.183837890625, |
|
"learning_rate": 4.056284035830846e-07, |
|
"loss": -0.0014, |
|
"reward": 1.0634922683238983, |
|
"reward_std": 0.5580763593316078, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/cosine_scaled_reward": 0.44742076098918915, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 1347.2098693847656, |
|
"epoch": 0.6459259259259259, |
|
"grad_norm": 0.37978234862418775, |
|
"kl": 0.19873046875, |
|
"learning_rate": 4.012171499441578e-07, |
|
"loss": -0.0295, |
|
"reward": 1.2675736546516418, |
|
"reward_std": 0.4535221755504608, |
|
"rewards/accuracy_reward": 0.6964286118745804, |
|
"rewards/cosine_scaled_reward": 0.5711449980735779, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 978.2991638183594, |
|
"epoch": 0.6488888888888888, |
|
"grad_norm": 0.3878234235322433, |
|
"kl": 0.216796875, |
|
"learning_rate": 3.968218905401853e-07, |
|
"loss": 0.068, |
|
"reward": 1.36513289809227, |
|
"reward_std": 0.6089888289570808, |
|
"rewards/accuracy_reward": 0.7276786267757416, |
|
"rewards/cosine_scaled_reward": 0.6374543011188507, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 1114.9018249511719, |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 0.3757339213906905, |
|
"kl": 0.216064453125, |
|
"learning_rate": 3.924430978638742e-07, |
|
"loss": 0.0546, |
|
"reward": 1.0311194062232971, |
|
"reward_std": 0.5321889817714691, |
|
"rewards/accuracy_reward": 0.5625000298023224, |
|
"rewards/cosine_scaled_reward": 0.46861938387155533, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1431.6875915527344, |
|
"epoch": 0.6548148148148148, |
|
"grad_norm": 0.38591627549365615, |
|
"kl": 0.21826171875, |
|
"learning_rate": 3.8808124263774955e-07, |
|
"loss": 0.0098, |
|
"reward": 1.2712106704711914, |
|
"reward_std": 0.5642440319061279, |
|
"rewards/accuracy_reward": 0.6875000447034836, |
|
"rewards/cosine_scaled_reward": 0.583710677921772, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 1437.6116943359375, |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 0.3628737795234154, |
|
"kl": 0.210205078125, |
|
"learning_rate": 3.8373679376355195e-07, |
|
"loss": 0.006, |
|
"reward": 0.9208376854658127, |
|
"reward_std": 0.5681522116065025, |
|
"rewards/accuracy_reward": 0.5357143059372902, |
|
"rewards/cosine_scaled_reward": 0.3851233683526516, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1019.7589416503906, |
|
"epoch": 0.6607407407407407, |
|
"grad_norm": 0.4796974552820394, |
|
"kl": 0.23583984375, |
|
"learning_rate": 3.794102182718294e-07, |
|
"loss": -0.0979, |
|
"reward": 1.2191343009471893, |
|
"reward_std": 0.509204089641571, |
|
"rewards/accuracy_reward": 0.651785746216774, |
|
"rewards/cosine_scaled_reward": 0.5673485770821571, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 1058.0357360839844, |
|
"epoch": 0.6637037037037037, |
|
"grad_norm": 0.46320248751547094, |
|
"kl": 0.266357421875, |
|
"learning_rate": 3.751019812717322e-07, |
|
"loss": -0.0134, |
|
"reward": 1.1183428168296814, |
|
"reward_std": 0.6392548233270645, |
|
"rewards/accuracy_reward": 0.6071428805589676, |
|
"rewards/cosine_scaled_reward": 0.5111999437212944, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1300.5268249511719, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.4371531247021142, |
|
"kl": 0.274169921875, |
|
"learning_rate": 3.708125459010134e-07, |
|
"loss": -0.1225, |
|
"reward": 1.1466220319271088, |
|
"reward_std": 0.5585737600922585, |
|
"rewards/accuracy_reward": 0.611607164144516, |
|
"rewards/cosine_scaled_reward": 0.5350148379802704, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1168.6652374267578, |
|
"epoch": 0.6696296296296296, |
|
"grad_norm": 0.5078015018739912, |
|
"kl": 0.24072265625, |
|
"learning_rate": 3.6654237327624003e-07, |
|
"loss": 0.1142, |
|
"reward": 1.0346617102622986, |
|
"reward_std": 0.5203389897942543, |
|
"rewards/accuracy_reward": 0.580357164144516, |
|
"rewards/cosine_scaled_reward": 0.454304538667202, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1110.995590209961, |
|
"epoch": 0.6725925925925926, |
|
"grad_norm": 0.4665048248523059, |
|
"kl": 0.264404296875, |
|
"learning_rate": 3.622919224432248e-07, |
|
"loss": -0.074, |
|
"reward": 1.2056891322135925, |
|
"reward_std": 0.6480113118886948, |
|
"rewards/accuracy_reward": 0.642857164144516, |
|
"rewards/cosine_scaled_reward": 0.5628319680690765, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1133.5313186645508, |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 0.3798907809838722, |
|
"kl": 0.2587890625, |
|
"learning_rate": 3.580616503276772e-07, |
|
"loss": 0.0722, |
|
"reward": 1.1472938358783722, |
|
"reward_std": 0.6741429939866066, |
|
"rewards/accuracy_reward": 0.620535746216774, |
|
"rewards/cosine_scaled_reward": 0.5267581045627594, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1382.5670318603516, |
|
"epoch": 0.6785185185185185, |
|
"grad_norm": 0.4236817109835327, |
|
"kl": 0.204345703125, |
|
"learning_rate": 3.5385201168608303e-07, |
|
"loss": 0.0165, |
|
"reward": 1.1142818331718445, |
|
"reward_std": 0.49922633171081543, |
|
"rewards/accuracy_reward": 0.6383928954601288, |
|
"rewards/cosine_scaled_reward": 0.4758888818323612, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1587.8170318603516, |
|
"epoch": 0.6814814814814815, |
|
"grad_norm": 0.3672380211269815, |
|
"kl": 0.1927490234375, |
|
"learning_rate": 3.4966345905681984e-07, |
|
"loss": 0.0907, |
|
"reward": 1.0480027794837952, |
|
"reward_std": 0.713165745139122, |
|
"rewards/accuracy_reward": 0.6116071790456772, |
|
"rewards/cosine_scaled_reward": 0.4363955929875374, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1256.8348846435547, |
|
"epoch": 0.6844444444444444, |
|
"grad_norm": 0.33221266123133913, |
|
"kl": 0.22412109375, |
|
"learning_rate": 3.4549644271150723e-07, |
|
"loss": -0.0146, |
|
"reward": 1.2618545591831207, |
|
"reward_std": 0.4967653974890709, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5743545740842819, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1367.9197235107422, |
|
"epoch": 0.6874074074074074, |
|
"grad_norm": 0.387482879229943, |
|
"kl": 0.236572265625, |
|
"learning_rate": 3.413514106066026e-07, |
|
"loss": -0.0083, |
|
"reward": 1.1527521908283234, |
|
"reward_std": 0.5814446583390236, |
|
"rewards/accuracy_reward": 0.6339286118745804, |
|
"rewards/cosine_scaled_reward": 0.518823616206646, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 1276.5402221679688, |
|
"epoch": 0.6903703703703704, |
|
"grad_norm": 0.46231425089597195, |
|
"kl": 0.236083984375, |
|
"learning_rate": 3.3722880833524704e-07, |
|
"loss": -0.0412, |
|
"reward": 1.0933943092823029, |
|
"reward_std": 0.6421699896454811, |
|
"rewards/accuracy_reward": 0.611607164144516, |
|
"rewards/cosine_scaled_reward": 0.48178714513778687, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 721.794677734375, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.5338257753145169, |
|
"kl": 0.31005859375, |
|
"learning_rate": 3.3312907907936097e-07, |
|
"loss": 0.0071, |
|
"reward": 1.3637142181396484, |
|
"reward_std": 0.46142444014549255, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.6494284868240356, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1033.7678909301758, |
|
"epoch": 0.6962962962962963, |
|
"grad_norm": 0.43977586309359473, |
|
"kl": 0.251220703125, |
|
"learning_rate": 3.2905266356200506e-07, |
|
"loss": 0.0159, |
|
"reward": 1.279816746711731, |
|
"reward_std": 0.5453010722994804, |
|
"rewards/accuracy_reward": 0.6919643133878708, |
|
"rewards/cosine_scaled_reward": 0.587852418422699, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1518.8437805175781, |
|
"epoch": 0.6992592592592592, |
|
"grad_norm": 0.40240053570452927, |
|
"kl": 0.21875, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0586, |
|
"reward": 1.1353959143161774, |
|
"reward_std": 0.4257539436221123, |
|
"rewards/accuracy_reward": 0.6339286118745804, |
|
"rewards/cosine_scaled_reward": 0.5014673247933388, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1218.2813110351562, |
|
"epoch": 0.7022222222222222, |
|
"grad_norm": 0.42429461890467546, |
|
"kl": 0.242919921875, |
|
"learning_rate": 3.2097152405681904e-07, |
|
"loss": -0.0396, |
|
"reward": 1.2611887753009796, |
|
"reward_std": 0.6024208590388298, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.560295857489109, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1438.3527221679688, |
|
"epoch": 0.7051851851851851, |
|
"grad_norm": 0.3284405554836145, |
|
"kl": 0.1895751953125, |
|
"learning_rate": 3.1696766879575354e-07, |
|
"loss": 0.0122, |
|
"reward": 1.0729888081550598, |
|
"reward_std": 0.41248803213238716, |
|
"rewards/accuracy_reward": 0.6071428954601288, |
|
"rewards/cosine_scaled_reward": 0.4658459797501564, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1612.4732971191406, |
|
"epoch": 0.7081481481481482, |
|
"grad_norm": 0.3092443961041311, |
|
"kl": 0.189453125, |
|
"learning_rate": 3.1298886463335857e-07, |
|
"loss": 0.0405, |
|
"reward": 1.1404339224100113, |
|
"reward_std": 0.49810411036014557, |
|
"rewards/accuracy_reward": 0.6383928805589676, |
|
"rewards/cosine_scaled_reward": 0.5020410493016243, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1501.8438110351562, |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.42545118282377176, |
|
"kl": 0.20068359375, |
|
"learning_rate": 3.090355392931827e-07, |
|
"loss": 0.1038, |
|
"reward": 1.1629545539617538, |
|
"reward_std": 0.49815448373556137, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/cosine_scaled_reward": 0.5022402182221413, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1084.200942993164, |
|
"epoch": 0.7140740740740741, |
|
"grad_norm": 0.5829087708630915, |
|
"kl": 0.287109375, |
|
"learning_rate": 3.051081177597876e-07, |
|
"loss": 0.0288, |
|
"reward": 1.4094779789447784, |
|
"reward_std": 0.4218045175075531, |
|
"rewards/accuracy_reward": 0.7633928954601288, |
|
"rewards/cosine_scaled_reward": 0.6460850834846497, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1364.0044860839844, |
|
"epoch": 0.717037037037037, |
|
"grad_norm": 0.4157551911374349, |
|
"kl": 0.239990234375, |
|
"learning_rate": 3.012070222330629e-07, |
|
"loss": 0.0491, |
|
"reward": 1.2944897413253784, |
|
"reward_std": 0.4317842833697796, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/cosine_scaled_reward": 0.5891326069831848, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1542.388427734375, |
|
"epoch": 0.72, |
|
"grad_norm": 0.5376555808419441, |
|
"kl": 0.261962890625, |
|
"learning_rate": 2.97332672082837e-07, |
|
"loss": 0.0058, |
|
"reward": 1.268993079662323, |
|
"reward_std": 0.5607812628149986, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.5725645199418068, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1342.5402221679688, |
|
"epoch": 0.7229629629629629, |
|
"grad_norm": 0.4074657442549319, |
|
"kl": 0.228759765625, |
|
"learning_rate": 2.934854838037978e-07, |
|
"loss": -0.0441, |
|
"reward": 1.339747965335846, |
|
"reward_std": 0.49331291019916534, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.625462144613266, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1365.7366638183594, |
|
"epoch": 0.725925925925926, |
|
"grad_norm": 0.40832180400823287, |
|
"kl": 0.24951171875, |
|
"learning_rate": 2.8966587097071683e-07, |
|
"loss": -0.0179, |
|
"reward": 1.3763412535190582, |
|
"reward_std": 0.45648277550935745, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/cosine_scaled_reward": 0.617412656545639, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 876.0937957763672, |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 0.6321515347107881, |
|
"kl": 0.3095703125, |
|
"learning_rate": 2.8587424419399055e-07, |
|
"loss": 0.0573, |
|
"reward": 1.5959438979625702, |
|
"reward_std": 0.44527300633490086, |
|
"rewards/accuracy_reward": 0.8437500149011612, |
|
"rewards/cosine_scaled_reward": 0.7521938383579254, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 1092.5714950561523, |
|
"epoch": 0.7318518518518519, |
|
"grad_norm": 0.4067546608657204, |
|
"kl": 0.218017578125, |
|
"learning_rate": 2.821110110755004e-07, |
|
"loss": -0.0326, |
|
"reward": 1.2758931815624237, |
|
"reward_std": 0.5539436712861061, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5660717189311981, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 829.7857513427734, |
|
"epoch": 0.7348148148148148, |
|
"grad_norm": 0.5217243910881588, |
|
"kl": 0.3056640625, |
|
"learning_rate": 2.783765761647934e-07, |
|
"loss": 0.0326, |
|
"reward": 1.2323424369096756, |
|
"reward_std": 0.5618766322731972, |
|
"rewards/accuracy_reward": 0.6607143133878708, |
|
"rewards/cosine_scaled_reward": 0.5716281086206436, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1365.2009735107422, |
|
"epoch": 0.7377777777777778, |
|
"grad_norm": 0.4853542957910564, |
|
"kl": 0.29052734375, |
|
"learning_rate": 2.746713409155951e-07, |
|
"loss": 0.023, |
|
"reward": 1.2717522531747818, |
|
"reward_std": 0.581157274544239, |
|
"rewards/accuracy_reward": 0.6964286118745804, |
|
"rewards/cosine_scaled_reward": 0.575323686003685, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1353.825942993164, |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.5163739772137682, |
|
"kl": 0.26995849609375, |
|
"learning_rate": 2.709957036426512e-07, |
|
"loss": 0.0271, |
|
"reward": 1.3092380166053772, |
|
"reward_std": 0.5593340247869492, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.6038808077573776, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 1508.0982971191406, |
|
"epoch": 0.7437037037037038, |
|
"grad_norm": 0.4173902404577724, |
|
"kl": 0.208984375, |
|
"learning_rate": 2.6735005947890986e-07, |
|
"loss": 0.0223, |
|
"reward": 1.2288760542869568, |
|
"reward_std": 0.7006416544318199, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5413760542869568, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1491.982177734375, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.41265414833354097, |
|
"kl": 0.277099609375, |
|
"learning_rate": 2.6373480033304397e-07, |
|
"loss": -0.0232, |
|
"reward": 1.0731790214776993, |
|
"reward_std": 0.5543450340628624, |
|
"rewards/accuracy_reward": 0.5982142984867096, |
|
"rewards/cosine_scaled_reward": 0.4749646857380867, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 1239.7545623779297, |
|
"epoch": 0.7496296296296296, |
|
"grad_norm": 0.4917280354457927, |
|
"kl": 0.2900390625, |
|
"learning_rate": 2.6015031484732103e-07, |
|
"loss": 0.0201, |
|
"reward": 1.260190635919571, |
|
"reward_std": 0.5851811021566391, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5726906284689903, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 1361.5625915527344, |
|
"epoch": 0.7525925925925926, |
|
"grad_norm": 0.3469414349194081, |
|
"kl": 0.2412109375, |
|
"learning_rate": 2.565969883558236e-07, |
|
"loss": -0.0921, |
|
"reward": 1.24320450425148, |
|
"reward_std": 0.47326986491680145, |
|
"rewards/accuracy_reward": 0.6830357611179352, |
|
"rewards/cosine_scaled_reward": 0.5601687207818031, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1658.2188110351562, |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 0.3001068094750129, |
|
"kl": 0.2127685546875, |
|
"learning_rate": 2.5307520284302606e-07, |
|
"loss": 0.0615, |
|
"reward": 1.1176211386919022, |
|
"reward_std": 0.586233526468277, |
|
"rewards/accuracy_reward": 0.6383928805589676, |
|
"rewards/cosine_scaled_reward": 0.47922827303409576, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1364.2902526855469, |
|
"epoch": 0.7585185185185185, |
|
"grad_norm": 0.4237148024737694, |
|
"kl": 0.27880859375, |
|
"learning_rate": 2.495853369027309e-07, |
|
"loss": 0.0823, |
|
"reward": 1.1211449354887009, |
|
"reward_std": 0.6094193160533905, |
|
"rewards/accuracy_reward": 0.6205357313156128, |
|
"rewards/cosine_scaled_reward": 0.5006091818213463, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1790.7724609375, |
|
"epoch": 0.7614814814814815, |
|
"grad_norm": 0.259378407584805, |
|
"kl": 0.181396484375, |
|
"learning_rate": 2.4612776569736984e-07, |
|
"loss": 0.0149, |
|
"reward": 1.3074856102466583, |
|
"reward_std": 0.68328557908535, |
|
"rewards/accuracy_reward": 0.7276786118745804, |
|
"rewards/cosine_scaled_reward": 0.5798069983720779, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1608.384017944336, |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 0.453218761617686, |
|
"kl": 0.230712890625, |
|
"learning_rate": 2.4270286091767335e-07, |
|
"loss": 0.1279, |
|
"reward": 1.222515344619751, |
|
"reward_std": 0.3630467727780342, |
|
"rewards/accuracy_reward": 0.683035746216774, |
|
"rewards/cosine_scaled_reward": 0.539479598402977, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 685.4464645385742, |
|
"epoch": 0.7674074074074074, |
|
"grad_norm": 0.578456215741116, |
|
"kl": 0.3359375, |
|
"learning_rate": 2.39310990742714e-07, |
|
"loss": 0.1645, |
|
"reward": 1.4390722215175629, |
|
"reward_std": 0.5096501708030701, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/cosine_scaled_reward": 0.6712149977684021, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 1224.075942993164, |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 0.400946501548652, |
|
"kl": 0.25, |
|
"learning_rate": 2.3595251980032673e-07, |
|
"loss": 0.0252, |
|
"reward": 1.4241975545883179, |
|
"reward_std": 0.5224835053086281, |
|
"rewards/accuracy_reward": 0.767857164144516, |
|
"rewards/cosine_scaled_reward": 0.6563403755426407, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1514.6875610351562, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 0.32107696314591083, |
|
"kl": 0.21142578125, |
|
"learning_rate": 2.3262780912791183e-07, |
|
"loss": -0.0844, |
|
"reward": 1.2489876449108124, |
|
"reward_std": 0.47115904837846756, |
|
"rewards/accuracy_reward": 0.683035746216774, |
|
"rewards/cosine_scaled_reward": 0.5659519508481026, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1049.1741638183594, |
|
"epoch": 0.7762962962962963, |
|
"grad_norm": 0.4983381962623269, |
|
"kl": 0.283203125, |
|
"learning_rate": 2.2933721613362188e-07, |
|
"loss": -0.0656, |
|
"reward": 1.3514663726091385, |
|
"reward_std": 0.5631691515445709, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/cosine_scaled_reward": 0.6282520294189453, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1280.808090209961, |
|
"epoch": 0.7792592592592592, |
|
"grad_norm": 0.5370494902972633, |
|
"kl": 0.259521484375, |
|
"learning_rate": 2.2608109455794197e-07, |
|
"loss": 0.0951, |
|
"reward": 1.3311371505260468, |
|
"reward_std": 0.3910303898155689, |
|
"rewards/accuracy_reward": 0.7232143133878708, |
|
"rewards/cosine_scaled_reward": 0.6079228222370148, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1652.7813262939453, |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 0.5517506308898386, |
|
"kl": 0.242431640625, |
|
"learning_rate": 2.2285979443566093e-07, |
|
"loss": -0.04, |
|
"reward": 1.1543093919754028, |
|
"reward_std": 0.543508306145668, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5203807801008224, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 1420.4197235107422, |
|
"epoch": 0.7851851851851852, |
|
"grad_norm": 0.44231688430689814, |
|
"kl": 0.17041015625, |
|
"learning_rate": 2.196736620582429e-07, |
|
"loss": -0.0778, |
|
"reward": 1.1269243955612183, |
|
"reward_std": 0.5865212008357048, |
|
"rewards/accuracy_reward": 0.6339286118745804, |
|
"rewards/cosine_scaled_reward": 0.49299580603837967, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 1418.6116333007812, |
|
"epoch": 0.7881481481481482, |
|
"grad_norm": 0.4078519749270961, |
|
"kl": 0.208984375, |
|
"learning_rate": 2.1652303993660146e-07, |
|
"loss": -0.0418, |
|
"reward": 1.27777498960495, |
|
"reward_std": 0.5090883374214172, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.5724178552627563, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1807.9688720703125, |
|
"epoch": 0.7911111111111111, |
|
"grad_norm": 0.3612322112017449, |
|
"kl": 0.204345703125, |
|
"learning_rate": 2.1340826676427826e-07, |
|
"loss": 0.0524, |
|
"reward": 0.9595437347888947, |
|
"reward_std": 0.5931633710861206, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/cosine_scaled_reward": 0.38811516016721725, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1627.9599304199219, |
|
"epoch": 0.794074074074074, |
|
"grad_norm": 0.46615314939080527, |
|
"kl": 0.2088623046875, |
|
"learning_rate": 2.103296773810344e-07, |
|
"loss": -0.0877, |
|
"reward": 1.2892495691776276, |
|
"reward_std": 0.5664657056331635, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5794281512498856, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 999.0357666015625, |
|
"epoch": 0.797037037037037, |
|
"grad_norm": 0.4273528196130989, |
|
"kl": 0.245361328125, |
|
"learning_rate": 2.0728760273685435e-07, |
|
"loss": -0.0457, |
|
"reward": 1.2847952246665955, |
|
"reward_std": 0.5983417630195618, |
|
"rewards/accuracy_reward": 0.6919643133878708, |
|
"rewards/cosine_scaled_reward": 0.5928309559822083, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1342.5179290771484, |
|
"epoch": 0.8, |
|
"grad_norm": 0.5157694271512652, |
|
"kl": 0.23779296875, |
|
"learning_rate": 2.0428236985636878e-07, |
|
"loss": -0.0943, |
|
"reward": 1.285570204257965, |
|
"reward_std": 0.6152675747871399, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.5846773013472557, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1232.8393859863281, |
|
"epoch": 0.802962962962963, |
|
"grad_norm": 0.4444373741140913, |
|
"kl": 0.234375, |
|
"learning_rate": 2.0131430180369957e-07, |
|
"loss": -0.0321, |
|
"reward": 1.4194039404392242, |
|
"reward_std": 0.510127916932106, |
|
"rewards/accuracy_reward": 0.7633928805589676, |
|
"rewards/cosine_scaled_reward": 0.6560111045837402, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1367.5134582519531, |
|
"epoch": 0.8059259259259259, |
|
"grad_norm": 0.4184360736447391, |
|
"kl": 0.247802734375, |
|
"learning_rate": 1.9838371764772992e-07, |
|
"loss": 0.0103, |
|
"reward": 1.1604254990816116, |
|
"reward_std": 0.6053372994065285, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5264968723058701, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 1476.8304290771484, |
|
"epoch": 0.8088888888888889, |
|
"grad_norm": 0.5070722115121915, |
|
"kl": 0.25146484375, |
|
"learning_rate": 1.954909324278041e-07, |
|
"loss": 0.0343, |
|
"reward": 1.1654971539974213, |
|
"reward_std": 0.5020733252167702, |
|
"rewards/accuracy_reward": 0.651785746216774, |
|
"rewards/cosine_scaled_reward": 0.5137114599347115, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1203.1920166015625, |
|
"epoch": 0.8118518518518518, |
|
"grad_norm": 0.4609308538909395, |
|
"kl": 0.251220703125, |
|
"learning_rate": 1.9263625711986092e-07, |
|
"loss": 0.0711, |
|
"reward": 1.279031217098236, |
|
"reward_std": 0.5459231436252594, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.5826026350259781, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1180.1205749511719, |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 1.5003547281483087, |
|
"kl": 0.27490234375, |
|
"learning_rate": 1.8981999860300385e-07, |
|
"loss": 0.1132, |
|
"reward": 1.3819984197616577, |
|
"reward_std": 0.5490370243787766, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.6319983601570129, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1468.6295471191406, |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 0.3926677811915502, |
|
"kl": 0.24267578125, |
|
"learning_rate": 1.8704245962651026e-07, |
|
"loss": 0.049, |
|
"reward": 1.3316981196403503, |
|
"reward_std": 0.5511728748679161, |
|
"rewards/accuracy_reward": 0.7232142984867096, |
|
"rewards/cosine_scaled_reward": 0.608483761548996, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1194.0089721679688, |
|
"epoch": 0.8207407407407408, |
|
"grad_norm": 0.5852846616386558, |
|
"kl": 0.3037109375, |
|
"learning_rate": 1.8430393877728745e-07, |
|
"loss": -0.079, |
|
"reward": 1.3836183547973633, |
|
"reward_std": 0.5606663823127747, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.6336182802915573, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1510.401870727539, |
|
"epoch": 0.8237037037037037, |
|
"grad_norm": 0.3342562243587002, |
|
"kl": 0.197509765625, |
|
"learning_rate": 1.8160473044777263e-07, |
|
"loss": 0.1086, |
|
"reward": 1.1657965332269669, |
|
"reward_std": 0.5177839547395706, |
|
"rewards/accuracy_reward": 0.651785746216774, |
|
"rewards/cosine_scaled_reward": 0.5140108019113541, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1184.7054138183594, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.45102568845817925, |
|
"kl": 0.261474609375, |
|
"learning_rate": 1.789451248042867e-07, |
|
"loss": -0.0701, |
|
"reward": 1.5106743574142456, |
|
"reward_std": 0.45990853384137154, |
|
"rewards/accuracy_reward": 0.8035714626312256, |
|
"rewards/cosine_scaled_reward": 0.7071028649806976, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1492.3527526855469, |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 0.37307331027960394, |
|
"kl": 0.17333984375, |
|
"learning_rate": 1.763254077558411e-07, |
|
"loss": 0.0009, |
|
"reward": 1.390456646680832, |
|
"reward_std": 0.5531467348337173, |
|
"rewards/accuracy_reward": 0.7589286118745804, |
|
"rewards/cosine_scaled_reward": 0.6315280720591545, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 773.513427734375, |
|
"epoch": 0.8325925925925926, |
|
"grad_norm": 0.5361053249805026, |
|
"kl": 0.29345703125, |
|
"learning_rate": 1.7374586092340194e-07, |
|
"loss": 0.0238, |
|
"reward": 1.5248645544052124, |
|
"reward_std": 0.502290703356266, |
|
"rewards/accuracy_reward": 0.8080357760190964, |
|
"rewards/cosine_scaled_reward": 0.7168288230895996, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1233.0045318603516, |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 0.4173056786421646, |
|
"kl": 0.243408203125, |
|
"learning_rate": 1.712067616096159e-07, |
|
"loss": -0.037, |
|
"reward": 1.3660497963428497, |
|
"reward_std": 0.6344530582427979, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/cosine_scaled_reward": 0.6249783635139465, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1620.200927734375, |
|
"epoch": 0.8385185185185186, |
|
"grad_norm": 0.39016732045765984, |
|
"kl": 0.1807861328125, |
|
"learning_rate": 1.6870838276900018e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1307050585746765, |
|
"reward_std": 0.5222566425800323, |
|
"rewards/accuracy_reward": 0.6428571790456772, |
|
"rewards/cosine_scaled_reward": 0.4878478869795799, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1542.8348693847656, |
|
"epoch": 0.8414814814814815, |
|
"grad_norm": 0.4725307883078238, |
|
"kl": 0.22265625, |
|
"learning_rate": 1.6625099297859945e-07, |
|
"loss": 0.0879, |
|
"reward": 1.2903397679328918, |
|
"reward_std": 0.5241145640611649, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.5760539919137955, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 1025.8527221679688, |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 0.5196454376003679, |
|
"kl": 0.26513671875, |
|
"learning_rate": 1.638348564091142e-07, |
|
"loss": 0.1132, |
|
"reward": 1.255773812532425, |
|
"reward_std": 0.6003080010414124, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.577202320098877, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1298.3348846435547, |
|
"epoch": 0.8474074074074074, |
|
"grad_norm": 0.5413412582194498, |
|
"kl": 0.248291015625, |
|
"learning_rate": 1.6146023279650146e-07, |
|
"loss": -0.0199, |
|
"reward": 1.0494957864284515, |
|
"reward_std": 0.5134128257632256, |
|
"rewards/accuracy_reward": 0.5758928880095482, |
|
"rewards/cosine_scaled_reward": 0.47360285371541977, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1183.8795013427734, |
|
"epoch": 0.8503703703703703, |
|
"grad_norm": 0.43187548636698553, |
|
"kl": 0.238037109375, |
|
"learning_rate": 1.5912737741405364e-07, |
|
"loss": 0.0592, |
|
"reward": 1.2715223133563995, |
|
"reward_std": 0.628858245909214, |
|
"rewards/accuracy_reward": 0.6830357313156128, |
|
"rewards/cosine_scaled_reward": 0.5884865522384644, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 1753.2188415527344, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.4428533634005759, |
|
"kl": 0.190185546875, |
|
"learning_rate": 1.5683654104495627e-07, |
|
"loss": 0.0715, |
|
"reward": 1.211821123957634, |
|
"reward_std": 0.4482051581144333, |
|
"rewards/accuracy_reward": 0.6696428954601288, |
|
"rewards/cosine_scaled_reward": 0.5421782657504082, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1441.8572082519531, |
|
"epoch": 0.8562962962962963, |
|
"grad_norm": 0.4890440610981784, |
|
"kl": 0.2373046875, |
|
"learning_rate": 1.5458796995532915e-07, |
|
"loss": 0.0065, |
|
"reward": 1.5017207860946655, |
|
"reward_std": 0.47509852796792984, |
|
"rewards/accuracy_reward": 0.816964328289032, |
|
"rewards/cosine_scaled_reward": 0.6847565025091171, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1205.3304290771484, |
|
"epoch": 0.8592592592592593, |
|
"grad_norm": 0.47871337934810404, |
|
"kl": 0.24365234375, |
|
"learning_rate": 1.5238190586775145e-07, |
|
"loss": 0.0997, |
|
"reward": 1.318919599056244, |
|
"reward_std": 0.5144237354397774, |
|
"rewards/accuracy_reward": 0.7232143133878708, |
|
"rewards/cosine_scaled_reward": 0.5957053601741791, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 1630.5804138183594, |
|
"epoch": 0.8622222222222222, |
|
"grad_norm": 0.32041894818924843, |
|
"kl": 0.1953125, |
|
"learning_rate": 1.50218585935278e-07, |
|
"loss": -0.038, |
|
"reward": 1.3746657818555832, |
|
"reward_std": 0.4751938730478287, |
|
"rewards/accuracy_reward": 0.754464328289032, |
|
"rewards/cosine_scaled_reward": 0.6202014237642288, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 930.7277297973633, |
|
"epoch": 0.8651851851851852, |
|
"grad_norm": 0.474044463723305, |
|
"kl": 0.260986328125, |
|
"learning_rate": 1.4809824271594384e-07, |
|
"loss": 0.0273, |
|
"reward": 1.3926972150802612, |
|
"reward_std": 0.4362456612288952, |
|
"rewards/accuracy_reward": 0.7500000298023224, |
|
"rewards/cosine_scaled_reward": 0.6426971927285194, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1605.7902221679688, |
|
"epoch": 0.8681481481481481, |
|
"grad_norm": 0.3263847223884831, |
|
"kl": 0.175048828125, |
|
"learning_rate": 1.4602110414776475e-07, |
|
"loss": -0.0205, |
|
"reward": 1.2047614008188248, |
|
"reward_std": 0.554048590362072, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/cosine_scaled_reward": 0.5083328485488892, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1610.9375915527344, |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 0.3401666470383707, |
|
"kl": 0.1712646484375, |
|
"learning_rate": 1.4398739352423406e-07, |
|
"loss": 0.0331, |
|
"reward": 1.011045515537262, |
|
"reward_std": 0.5167748332023621, |
|
"rewards/accuracy_reward": 0.5937500149011612, |
|
"rewards/cosine_scaled_reward": 0.41729553043842316, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 1926.4420471191406, |
|
"epoch": 0.8740740740740741, |
|
"grad_norm": 0.45709540487996353, |
|
"kl": 0.1748046875, |
|
"learning_rate": 1.419973294703174e-07, |
|
"loss": 0.0703, |
|
"reward": 1.0281963050365448, |
|
"reward_std": 0.47293490171432495, |
|
"rewards/accuracy_reward": 0.6026785895228386, |
|
"rewards/cosine_scaled_reward": 0.42551764845848083, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 1782.7411499023438, |
|
"epoch": 0.8770370370370371, |
|
"grad_norm": 0.33218294380687946, |
|
"kl": 0.2039794921875, |
|
"learning_rate": 1.400511259189518e-07, |
|
"loss": 0.0501, |
|
"reward": 1.1762474179267883, |
|
"reward_std": 0.37930237501859665, |
|
"rewards/accuracy_reward": 0.6696428954601288, |
|
"rewards/cosine_scaled_reward": 0.5066045522689819, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1053.7679023742676, |
|
"epoch": 0.88, |
|
"grad_norm": 0.41588133172026326, |
|
"kl": 0.24658203125, |
|
"learning_rate": 1.3814899208804677e-07, |
|
"loss": 0.0503, |
|
"reward": 1.2276224493980408, |
|
"reward_std": 0.5703399553894997, |
|
"rewards/accuracy_reward": 0.6785714477300644, |
|
"rewards/cosine_scaled_reward": 0.5490510165691376, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1754.9733276367188, |
|
"epoch": 0.882962962962963, |
|
"grad_norm": 0.4886881244904496, |
|
"kl": 0.2197265625, |
|
"learning_rate": 1.3629113245799361e-07, |
|
"loss": -0.0222, |
|
"reward": 1.1690296977758408, |
|
"reward_std": 0.5029887109994888, |
|
"rewards/accuracy_reward": 0.6562500298023224, |
|
"rewards/cosine_scaled_reward": 0.5127796456217766, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1098.4911346435547, |
|
"epoch": 0.8859259259259259, |
|
"grad_norm": 0.4020920520908739, |
|
"kl": 0.2568359375, |
|
"learning_rate": 1.3447774674968387e-07, |
|
"loss": -0.0071, |
|
"reward": 1.3663478195667267, |
|
"reward_std": 0.6112166717648506, |
|
"rewards/accuracy_reward": 0.7276786118745804, |
|
"rewards/cosine_scaled_reward": 0.6386693120002747, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1364.7634887695312, |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.6605478512720968, |
|
"kl": 0.256103515625, |
|
"learning_rate": 1.3270902990303869e-07, |
|
"loss": 0.1078, |
|
"reward": 1.3349690437316895, |
|
"reward_std": 0.43663863837718964, |
|
"rewards/accuracy_reward": 0.7321428954601288, |
|
"rewards/cosine_scaled_reward": 0.6028260812163353, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 989.0045013427734, |
|
"epoch": 0.8918518518518519, |
|
"grad_norm": 0.5497470505531581, |
|
"kl": 0.297119140625, |
|
"learning_rate": 1.3098517205605325e-07, |
|
"loss": -0.0579, |
|
"reward": 1.1443769484758377, |
|
"reward_std": 0.5397379323840141, |
|
"rewards/accuracy_reward": 0.6250000149011612, |
|
"rewards/cosine_scaled_reward": 0.5193769186735153, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1223.0357666015625, |
|
"epoch": 0.8948148148148148, |
|
"grad_norm": 0.44189728480389967, |
|
"kl": 0.241943359375, |
|
"learning_rate": 1.2930635852435634e-07, |
|
"loss": -0.0605, |
|
"reward": 1.2372848689556122, |
|
"reward_std": 0.6824733465909958, |
|
"rewards/accuracy_reward": 0.674107164144516, |
|
"rewards/cosine_scaled_reward": 0.563177689909935, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 1622.5179138183594, |
|
"epoch": 0.8977777777777778, |
|
"grad_norm": 0.3848582793128278, |
|
"kl": 0.2061767578125, |
|
"learning_rate": 1.276727697812894e-07, |
|
"loss": 0.0914, |
|
"reward": 1.2423473447561264, |
|
"reward_std": 0.647514745593071, |
|
"rewards/accuracy_reward": 0.6741071790456772, |
|
"rewards/cosine_scaled_reward": 0.5682401582598686, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1614.9286193847656, |
|
"epoch": 0.9007407407407407, |
|
"grad_norm": 0.3007297849667304, |
|
"kl": 0.209716796875, |
|
"learning_rate": 1.2608458143850493e-07, |
|
"loss": 0.0022, |
|
"reward": 1.2776865363121033, |
|
"reward_std": 0.6378434896469116, |
|
"rewards/accuracy_reward": 0.7008928805589676, |
|
"rewards/cosine_scaled_reward": 0.5767936706542969, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 1398.571517944336, |
|
"epoch": 0.9037037037037037, |
|
"grad_norm": 0.4951854800661223, |
|
"kl": 0.2578125, |
|
"learning_rate": 1.2454196422708843e-07, |
|
"loss": 0.0546, |
|
"reward": 0.9712510854005814, |
|
"reward_std": 0.5841480642557144, |
|
"rewards/accuracy_reward": 0.5625000223517418, |
|
"rewards/cosine_scaled_reward": 0.40875105932354927, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 1390.5759353637695, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.5060958415409001, |
|
"kl": 0.2391357421875, |
|
"learning_rate": 1.2304508397920499e-07, |
|
"loss": 0.0294, |
|
"reward": 1.0037438869476318, |
|
"reward_std": 0.6964580416679382, |
|
"rewards/accuracy_reward": 0.549107164144516, |
|
"rewards/cosine_scaled_reward": 0.45463668555021286, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1332.0715103149414, |
|
"epoch": 0.9096296296296297, |
|
"grad_norm": 0.40475083679377444, |
|
"kl": 0.251708984375, |
|
"learning_rate": 1.2159410161027153e-07, |
|
"loss": 0.0844, |
|
"reward": 1.3975183367729187, |
|
"reward_std": 0.5006646141409874, |
|
"rewards/accuracy_reward": 0.754464328289032, |
|
"rewards/cosine_scaled_reward": 0.6430540382862091, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1306.8259582519531, |
|
"epoch": 0.9125925925925926, |
|
"grad_norm": 0.33395393560033776, |
|
"kl": 0.218505859375, |
|
"learning_rate": 1.2018917310165926e-07, |
|
"loss": 0.016, |
|
"reward": 1.2559349834918976, |
|
"reward_std": 0.5287806503474712, |
|
"rewards/accuracy_reward": 0.7008928954601288, |
|
"rewards/cosine_scaled_reward": 0.5550421252846718, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 1175.1116333007812, |
|
"epoch": 0.9155555555555556, |
|
"grad_norm": 0.45684265652865813, |
|
"kl": 0.2197265625, |
|
"learning_rate": 1.1883044948392453e-07, |
|
"loss": 0.1229, |
|
"reward": 1.2261989116668701, |
|
"reward_std": 0.44794493168592453, |
|
"rewards/accuracy_reward": 0.6875000298023224, |
|
"rewards/cosine_scaled_reward": 0.5386988818645477, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1192.4420471191406, |
|
"epoch": 0.9185185185185185, |
|
"grad_norm": 0.5819087268590017, |
|
"kl": 0.321533203125, |
|
"learning_rate": 1.1751807682057396e-07, |
|
"loss": -0.0148, |
|
"reward": 1.4763469398021698, |
|
"reward_std": 0.473043292760849, |
|
"rewards/accuracy_reward": 0.7812500447034836, |
|
"rewards/cosine_scaled_reward": 0.6950969099998474, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1254.6205978393555, |
|
"epoch": 0.9214814814814815, |
|
"grad_norm": 0.5149829152884512, |
|
"kl": 0.271240234375, |
|
"learning_rate": 1.1625219619236196e-07, |
|
"loss": -0.0156, |
|
"reward": 1.2878541946411133, |
|
"reward_std": 0.5938218757510185, |
|
"rewards/accuracy_reward": 0.7053571790456772, |
|
"rewards/cosine_scaled_reward": 0.5824970304965973, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1246.2545013427734, |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 0.5342345338890936, |
|
"kl": 0.28076171875, |
|
"learning_rate": 1.1503294368212441e-07, |
|
"loss": -0.0491, |
|
"reward": 1.23801089823246, |
|
"reward_std": 0.48125500977039337, |
|
"rewards/accuracy_reward": 0.6651786118745804, |
|
"rewards/cosine_scaled_reward": 0.5728322416543961, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1623.9777221679688, |
|
"epoch": 0.9274074074074075, |
|
"grad_norm": 0.292124358723905, |
|
"kl": 0.16351318359375, |
|
"learning_rate": 1.1386045036015024e-07, |
|
"loss": -0.011, |
|
"reward": 1.1545456051826477, |
|
"reward_std": 0.5948602706193924, |
|
"rewards/accuracy_reward": 0.6607143133878708, |
|
"rewards/cosine_scaled_reward": 0.4938312843441963, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1209.8080596923828, |
|
"epoch": 0.9303703703703704, |
|
"grad_norm": 0.889231001318171, |
|
"kl": 0.288818359375, |
|
"learning_rate": 1.1273484227009072e-07, |
|
"loss": 0.0213, |
|
"reward": 1.2507951855659485, |
|
"reward_std": 0.6353217959403992, |
|
"rewards/accuracy_reward": 0.6785714477300644, |
|
"rewards/cosine_scaled_reward": 0.5722237303853035, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 1585.1250915527344, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.4536490161605513, |
|
"kl": 0.222412109375, |
|
"learning_rate": 1.116562404154099e-07, |
|
"loss": 0.0189, |
|
"reward": 1.1351844668388367, |
|
"reward_std": 0.535600557923317, |
|
"rewards/accuracy_reward": 0.6383928656578064, |
|
"rewards/cosine_scaled_reward": 0.4967915639281273, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 871.2946929931641, |
|
"epoch": 0.9362962962962963, |
|
"grad_norm": 0.596048822760235, |
|
"kl": 0.28515625, |
|
"learning_rate": 1.1062476074637685e-07, |
|
"loss": 0.1252, |
|
"reward": 1.359487771987915, |
|
"reward_std": 0.4618111401796341, |
|
"rewards/accuracy_reward": 0.7187500447034836, |
|
"rewards/cosine_scaled_reward": 0.6407377123832703, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 1378.9553985595703, |
|
"epoch": 0.9392592592592592, |
|
"grad_norm": 0.4592747700126037, |
|
"kl": 0.260986328125, |
|
"learning_rate": 1.0964051414760065e-07, |
|
"loss": 0.0193, |
|
"reward": 1.323800265789032, |
|
"reward_std": 0.47177664190530777, |
|
"rewards/accuracy_reward": 0.7187500447034836, |
|
"rewards/cosine_scaled_reward": 0.6050502583384514, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 858.9018096923828, |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 0.6123065766410447, |
|
"kl": 0.33349609375, |
|
"learning_rate": 1.087036064261106e-07, |
|
"loss": 0.0145, |
|
"reward": 1.4594223201274872, |
|
"reward_std": 0.5125085860490799, |
|
"rewards/accuracy_reward": 0.7633928954601288, |
|
"rewards/cosine_scaled_reward": 0.696029394865036, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1336.3348846435547, |
|
"epoch": 0.9451851851851852, |
|
"grad_norm": 0.39416387811795184, |
|
"kl": 0.247314453125, |
|
"learning_rate": 1.0781413829998135e-07, |
|
"loss": -0.0303, |
|
"reward": 1.2021480649709702, |
|
"reward_std": 0.5442958772182465, |
|
"rewards/accuracy_reward": 0.6696428805589676, |
|
"rewards/cosine_scaled_reward": 0.5325051471590996, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 1438.3973999023438, |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 0.43467725744059227, |
|
"kl": 0.255859375, |
|
"learning_rate": 1.0697220538750631e-07, |
|
"loss": 0.0463, |
|
"reward": 1.1738777160644531, |
|
"reward_std": 0.6729736477136612, |
|
"rewards/accuracy_reward": 0.6428571790456772, |
|
"rewards/cosine_scaled_reward": 0.5310205593705177, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1484.1072387695312, |
|
"epoch": 0.9511111111111111, |
|
"grad_norm": 0.45452453298221474, |
|
"kl": 0.2275390625, |
|
"learning_rate": 1.0617789819691819e-07, |
|
"loss": -0.0514, |
|
"reward": 1.105325609445572, |
|
"reward_std": 0.6967962235212326, |
|
"rewards/accuracy_reward": 0.6205357387661934, |
|
"rewards/cosine_scaled_reward": 0.48478981852531433, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1666.290283203125, |
|
"epoch": 0.9540740740740741, |
|
"grad_norm": 0.39839803599498697, |
|
"kl": 0.240966796875, |
|
"learning_rate": 1.054313021166595e-07, |
|
"loss": 0.0017, |
|
"reward": 1.1861306875944138, |
|
"reward_std": 0.6368418782949448, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/cosine_scaled_reward": 0.5254162922501564, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1225.4197006225586, |
|
"epoch": 0.957037037037037, |
|
"grad_norm": 0.5218036130830189, |
|
"kl": 0.220703125, |
|
"learning_rate": 1.0473249740620304e-07, |
|
"loss": -0.0204, |
|
"reward": 1.1594546139240265, |
|
"reward_std": 0.6208428591489792, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/cosine_scaled_reward": 0.5255259871482849, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 1377.6339721679688, |
|
"epoch": 0.96, |
|
"grad_norm": 0.39857455474607256, |
|
"kl": 0.248291015625, |
|
"learning_rate": 1.0408155918742432e-07, |
|
"loss": -0.0069, |
|
"reward": 1.1889010518789291, |
|
"reward_std": 0.5303689762949944, |
|
"rewards/accuracy_reward": 0.6651786118745804, |
|
"rewards/cosine_scaled_reward": 0.5237224623560905, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 1925.6072387695312, |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.4234383833079227, |
|
"kl": 0.2197265625, |
|
"learning_rate": 1.034785574365256e-07, |
|
"loss": 0.0489, |
|
"reward": 1.1286714375019073, |
|
"reward_std": 0.564603678882122, |
|
"rewards/accuracy_reward": 0.6294643133878708, |
|
"rewards/cosine_scaled_reward": 0.4992070645093918, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1323.1027221679688, |
|
"epoch": 0.965925925925926, |
|
"grad_norm": 0.4937948952732904, |
|
"kl": 0.270263671875, |
|
"learning_rate": 1.0292355697651348e-07, |
|
"loss": 0.0144, |
|
"reward": 0.9994739443063736, |
|
"reward_std": 0.641077071428299, |
|
"rewards/accuracy_reward": 0.5758928805589676, |
|
"rewards/cosine_scaled_reward": 0.42358100414276123, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 910.9375457763672, |
|
"epoch": 0.9688888888888889, |
|
"grad_norm": 0.5236547427670164, |
|
"kl": 0.32568359375, |
|
"learning_rate": 1.0241661747023064e-07, |
|
"loss": 0.0655, |
|
"reward": 1.1969931423664093, |
|
"reward_std": 0.5603185072541237, |
|
"rewards/accuracy_reward": 0.6473214477300644, |
|
"rewards/cosine_scaled_reward": 0.5496717244386673, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1499.5313110351562, |
|
"epoch": 0.9718518518518519, |
|
"grad_norm": 0.5317429397319068, |
|
"kl": 0.257568359375, |
|
"learning_rate": 1.0195779341394164e-07, |
|
"loss": 0.129, |
|
"reward": 1.354402244091034, |
|
"reward_std": 0.4894205704331398, |
|
"rewards/accuracy_reward": 0.7321428805589676, |
|
"rewards/cosine_scaled_reward": 0.6222593784332275, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 1111.9330749511719, |
|
"epoch": 0.9748148148148148, |
|
"grad_norm": 0.5147699965476735, |
|
"kl": 0.260498046875, |
|
"learning_rate": 1.0154713413147486e-07, |
|
"loss": 0.078, |
|
"reward": 1.2603758871555328, |
|
"reward_std": 0.47384266555309296, |
|
"rewards/accuracy_reward": 0.7098214626312256, |
|
"rewards/cosine_scaled_reward": 0.5505543872714043, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 1475.4911346435547, |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 0.42585894642650685, |
|
"kl": 0.23876953125, |
|
"learning_rate": 1.0118468376892005e-07, |
|
"loss": 0.0107, |
|
"reward": 1.1338723003864288, |
|
"reward_std": 0.4878518432378769, |
|
"rewards/accuracy_reward": 0.6250000298023224, |
|
"rewards/cosine_scaled_reward": 0.5088722705841064, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1144.6652374267578, |
|
"epoch": 0.9807407407407407, |
|
"grad_norm": 0.5024574257196305, |
|
"kl": 0.282470703125, |
|
"learning_rate": 1.0087048128988256e-07, |
|
"loss": 0.0042, |
|
"reward": 1.372282713651657, |
|
"reward_std": 0.4418197050690651, |
|
"rewards/accuracy_reward": 0.7410714477300644, |
|
"rewards/cosine_scaled_reward": 0.6312113404273987, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1154.2723846435547, |
|
"epoch": 0.9837037037037037, |
|
"grad_norm": 0.5099595034924427, |
|
"kl": 0.30908203125, |
|
"learning_rate": 1.0060456047129485e-07, |
|
"loss": 0.0052, |
|
"reward": 1.4354938864707947, |
|
"reward_std": 0.5711122378706932, |
|
"rewards/accuracy_reward": 0.7633928954601288, |
|
"rewards/cosine_scaled_reward": 0.6721010059118271, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1347.49560546875, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 0.41094105741848364, |
|
"kl": 0.214111328125, |
|
"learning_rate": 1.0038694989978531e-07, |
|
"loss": -0.0321, |
|
"reward": 1.2599745690822601, |
|
"reward_std": 0.5518456846475601, |
|
"rewards/accuracy_reward": 0.6919643133878708, |
|
"rewards/cosine_scaled_reward": 0.5680102482438087, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1579.7054138183594, |
|
"epoch": 0.9896296296296296, |
|
"grad_norm": 0.3982623902115583, |
|
"kl": 0.1856689453125, |
|
"learning_rate": 1.0021767296860537e-07, |
|
"loss": 0.1626, |
|
"reward": 1.2055756747722626, |
|
"reward_std": 0.5490602627396584, |
|
"rewards/accuracy_reward": 0.6696428805589676, |
|
"rewards/cosine_scaled_reward": 0.535932794213295, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1537.3438110351562, |
|
"epoch": 0.9925925925925926, |
|
"grad_norm": 0.43671825802827985, |
|
"kl": 0.238037109375, |
|
"learning_rate": 1.0009674787511447e-07, |
|
"loss": -0.0007, |
|
"reward": 1.2209820598363876, |
|
"reward_std": 0.5103197321295738, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/cosine_scaled_reward": 0.542410634458065, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 1196.111686706543, |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 0.6107691362588656, |
|
"kl": 0.291259765625, |
|
"learning_rate": 1.0002418761882409e-07, |
|
"loss": 0.0202, |
|
"reward": 1.3667995631694794, |
|
"reward_std": 0.44663529843091965, |
|
"rewards/accuracy_reward": 0.7366071790456772, |
|
"rewards/cosine_scaled_reward": 0.6301924362778664, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 1011.8393211364746, |
|
"epoch": 0.9985185185185185, |
|
"grad_norm": 0.7265395322809004, |
|
"kl": 0.2783203125, |
|
"learning_rate": 1e-07, |
|
"loss": 0.1413, |
|
"reward": 1.3486962914466858, |
|
"reward_std": 0.43993912637233734, |
|
"rewards/accuracy_reward": 0.7276785969734192, |
|
"rewards/cosine_scaled_reward": 0.621017687022686, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.9985185185185185, |
|
"step": 337, |
|
"total_flos": 0.0, |
|
"train_loss": 0.028963628810559583, |
|
"train_runtime": 59585.1579, |
|
"train_samples_per_second": 0.181, |
|
"train_steps_per_second": 0.006 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 337, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|