DeepSeek-R1-Distill-Qwen-7B-GRPO / trainer_state.json
Kadins's picture
Model save
a62814b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985185185185185,
"eval_steps": 500,
"global_step": 337,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1801.2277526855469,
"epoch": 0.002962962962962963,
"grad_norm": 0.15486460470342944,
"kl": 0.0,
"learning_rate": 2.941176470588235e-08,
"loss": 0.0214,
"reward": 1.2742209732532501,
"reward_std": 0.4368269592523575,
"rewards/accuracy_reward": 0.7901786118745804,
"rewards/cosine_scaled_reward": 0.48404236510396004,
"step": 1
},
{
"completion_length": 2185.982208251953,
"epoch": 0.005925925925925926,
"grad_norm": 0.13978450729681857,
"kl": 0.0,
"learning_rate": 5.88235294117647e-08,
"loss": -0.0103,
"reward": 0.9630088359117508,
"reward_std": 0.3557196706533432,
"rewards/accuracy_reward": 0.620535746216774,
"rewards/cosine_scaled_reward": 0.3424730747938156,
"step": 2
},
{
"completion_length": 2115.093780517578,
"epoch": 0.008888888888888889,
"grad_norm": 0.1838790154380984,
"kl": 0.00013649463653564453,
"learning_rate": 8.823529411764706e-08,
"loss": 0.1074,
"reward": 1.1710642874240875,
"reward_std": 0.40886973589658737,
"rewards/accuracy_reward": 0.705357164144516,
"rewards/cosine_scaled_reward": 0.46570710837841034,
"step": 3
},
{
"completion_length": 2096.4554138183594,
"epoch": 0.011851851851851851,
"grad_norm": 0.136462910218941,
"kl": 0.0001285076141357422,
"learning_rate": 1.176470588235294e-07,
"loss": -0.0027,
"reward": 1.3344760239124298,
"reward_std": 0.5786719098687172,
"rewards/accuracy_reward": 0.7857143431901932,
"rewards/cosine_scaled_reward": 0.5487616658210754,
"step": 4
},
{
"completion_length": 1957.7903137207031,
"epoch": 0.014814814814814815,
"grad_norm": 0.14830871529797945,
"kl": 0.00012803077697753906,
"learning_rate": 1.4705882352941175e-07,
"loss": -0.0675,
"reward": 1.1853420436382294,
"reward_std": 0.3382917121052742,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.47105635702610016,
"step": 5
},
{
"completion_length": 1888.9197082519531,
"epoch": 0.017777777777777778,
"grad_norm": 0.12404996018310206,
"kl": 0.0001093149185180664,
"learning_rate": 1.764705882352941e-07,
"loss": 0.0027,
"reward": 1.42927685379982,
"reward_std": 0.31518058851361275,
"rewards/accuracy_reward": 0.8258928805589676,
"rewards/cosine_scaled_reward": 0.60338394343853,
"step": 6
},
{
"completion_length": 1843.6385192871094,
"epoch": 0.02074074074074074,
"grad_norm": 0.13872508211750081,
"kl": 0.00011134147644042969,
"learning_rate": 2.0588235294117645e-07,
"loss": 0.0631,
"reward": 1.3414935171604156,
"reward_std": 0.35266988538205624,
"rewards/accuracy_reward": 0.785714328289032,
"rewards/cosine_scaled_reward": 0.5557792335748672,
"step": 7
},
{
"completion_length": 2165.2545166015625,
"epoch": 0.023703703703703703,
"grad_norm": 0.13634517766953264,
"kl": 0.0001277923583984375,
"learning_rate": 2.352941176470588e-07,
"loss": -0.009,
"reward": 1.2295733392238617,
"reward_std": 0.4489167779684067,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.4885018467903137,
"step": 8
},
{
"completion_length": 2090.2500915527344,
"epoch": 0.02666666666666667,
"grad_norm": 0.13802999500843113,
"kl": 0.0001348257064819336,
"learning_rate": 2.6470588235294114e-07,
"loss": 0.0217,
"reward": 1.1188903898000717,
"reward_std": 0.3858538120985031,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.41353320330381393,
"step": 9
},
{
"completion_length": 1973.0134887695312,
"epoch": 0.02962962962962963,
"grad_norm": 0.14065638701142744,
"kl": 0.00011610984802246094,
"learning_rate": 2.941176470588235e-07,
"loss": 0.0352,
"reward": 1.126141995191574,
"reward_std": 0.4128708764910698,
"rewards/accuracy_reward": 0.6964286044239998,
"rewards/cosine_scaled_reward": 0.42971338517963886,
"step": 10
},
{
"completion_length": 2356.442108154297,
"epoch": 0.03259259259259259,
"grad_norm": 0.15235664392785112,
"kl": 0.00015854835510253906,
"learning_rate": 3.2352941176470586e-07,
"loss": 0.0137,
"reward": 1.133531704545021,
"reward_std": 0.36801043152809143,
"rewards/accuracy_reward": 0.7187500447034836,
"rewards/cosine_scaled_reward": 0.41478168219327927,
"step": 11
},
{
"completion_length": 1793.4420776367188,
"epoch": 0.035555555555555556,
"grad_norm": 0.15030605931327,
"kl": 9.799003601074219e-05,
"learning_rate": 3.529411764705882e-07,
"loss": 0.0135,
"reward": 1.1753974556922913,
"reward_std": 0.2614951431751251,
"rewards/accuracy_reward": 0.7187500149011612,
"rewards/cosine_scaled_reward": 0.4566473960876465,
"step": 12
},
{
"completion_length": 1617.4732971191406,
"epoch": 0.03851851851851852,
"grad_norm": 0.16439362332784327,
"kl": 9.578466415405273e-05,
"learning_rate": 3.8235294117647053e-07,
"loss": 0.0481,
"reward": 1.3459759652614594,
"reward_std": 0.19704193621873856,
"rewards/accuracy_reward": 0.848214328289032,
"rewards/cosine_scaled_reward": 0.49776165932416916,
"step": 13
},
{
"completion_length": 1364.5179443359375,
"epoch": 0.04148148148148148,
"grad_norm": 0.17539080474732854,
"kl": 9.167194366455078e-05,
"learning_rate": 4.117647058823529e-07,
"loss": -0.0146,
"reward": 1.5306860208511353,
"reward_std": 0.2709691859781742,
"rewards/accuracy_reward": 0.9196428954601288,
"rewards/cosine_scaled_reward": 0.6110431402921677,
"step": 14
},
{
"completion_length": 1916.4286193847656,
"epoch": 0.044444444444444446,
"grad_norm": 0.16486671210804701,
"kl": 0.0001081228256225586,
"learning_rate": 4.4117647058823526e-07,
"loss": 0.0016,
"reward": 1.2406930327415466,
"reward_std": 0.5602811053395271,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/cosine_scaled_reward": 0.46837157011032104,
"step": 15
},
{
"completion_length": 1566.6429443359375,
"epoch": 0.047407407407407405,
"grad_norm": 0.15688724371158902,
"kl": 9.28640365600586e-05,
"learning_rate": 4.705882352941176e-07,
"loss": 0.0135,
"reward": 1.4592451453208923,
"reward_std": 0.39220860973000526,
"rewards/accuracy_reward": 0.8526786118745804,
"rewards/cosine_scaled_reward": 0.606566533446312,
"step": 16
},
{
"completion_length": 1577.1116333007812,
"epoch": 0.05037037037037037,
"grad_norm": 0.18124476360697023,
"kl": 0.00010126829147338867,
"learning_rate": 5e-07,
"loss": 0.0808,
"reward": 1.2929720282554626,
"reward_std": 0.33900806307792664,
"rewards/accuracy_reward": 0.7901786118745804,
"rewards/cosine_scaled_reward": 0.5027934014797211,
"step": 17
},
{
"completion_length": 1941.6697387695312,
"epoch": 0.05333333333333334,
"grad_norm": 0.1480224883739112,
"kl": 0.00013065338134765625,
"learning_rate": 5.294117647058823e-07,
"loss": 0.0215,
"reward": 1.2064578533172607,
"reward_std": 0.5407338961958885,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.45645780116319656,
"step": 18
},
{
"completion_length": 2221.3616943359375,
"epoch": 0.056296296296296296,
"grad_norm": 0.18446288084506915,
"kl": 0.00013172626495361328,
"learning_rate": 5.588235294117647e-07,
"loss": 0.1226,
"reward": 1.3185656666755676,
"reward_std": 0.5382702872157097,
"rewards/accuracy_reward": 0.7857143133878708,
"rewards/cosine_scaled_reward": 0.5328513234853745,
"step": 19
},
{
"completion_length": 1600.8304748535156,
"epoch": 0.05925925925925926,
"grad_norm": 0.15136971669879618,
"kl": 8.463859558105469e-05,
"learning_rate": 5.88235294117647e-07,
"loss": 0.0009,
"reward": 1.3560706675052643,
"reward_std": 0.27356908470392227,
"rewards/accuracy_reward": 0.7946428954601288,
"rewards/cosine_scaled_reward": 0.5614277720451355,
"step": 20
},
{
"completion_length": 1814.1295471191406,
"epoch": 0.06222222222222222,
"grad_norm": 0.14862372479915026,
"kl": 0.00012290477752685547,
"learning_rate": 6.176470588235294e-07,
"loss": 0.0478,
"reward": 1.3167778551578522,
"reward_std": 0.351048968732357,
"rewards/accuracy_reward": 0.7901786118745804,
"rewards/cosine_scaled_reward": 0.5265992805361748,
"step": 21
},
{
"completion_length": 2074.214385986328,
"epoch": 0.06518518518518518,
"grad_norm": 0.16607453666951524,
"kl": 0.00013315677642822266,
"learning_rate": 6.470588235294117e-07,
"loss": 0.0513,
"reward": 1.0832807272672653,
"reward_std": 0.39553803764283657,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.40470923483371735,
"step": 22
},
{
"completion_length": 1610.8572387695312,
"epoch": 0.06814814814814815,
"grad_norm": 0.16726951041321547,
"kl": 9.208917617797852e-05,
"learning_rate": 6.764705882352941e-07,
"loss": 0.0374,
"reward": 1.421825647354126,
"reward_std": 0.3287600055336952,
"rewards/accuracy_reward": 0.8348214626312256,
"rewards/cosine_scaled_reward": 0.5870041102170944,
"step": 23
},
{
"completion_length": 1918.0357971191406,
"epoch": 0.07111111111111111,
"grad_norm": 0.17652478738451452,
"kl": 0.00012230873107910156,
"learning_rate": 7.058823529411765e-07,
"loss": 0.0782,
"reward": 1.2598042488098145,
"reward_std": 0.3701645387336612,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.5187327712774277,
"step": 24
},
{
"completion_length": 1735.2188720703125,
"epoch": 0.07407407407407407,
"grad_norm": 0.18159857154961948,
"kl": 0.00011110305786132812,
"learning_rate": 7.352941176470589e-07,
"loss": 0.1127,
"reward": 1.3935684263706207,
"reward_std": 0.3388653099536896,
"rewards/accuracy_reward": 0.8303571939468384,
"rewards/cosine_scaled_reward": 0.5632113069295883,
"step": 25
},
{
"completion_length": 1876.6116943359375,
"epoch": 0.07703703703703704,
"grad_norm": 0.14418288739115104,
"kl": 0.00010466575622558594,
"learning_rate": 7.647058823529411e-07,
"loss": 0.0367,
"reward": 1.2820298671722412,
"reward_std": 0.3634565807878971,
"rewards/accuracy_reward": 0.7812500298023224,
"rewards/cosine_scaled_reward": 0.5007798671722412,
"step": 26
},
{
"completion_length": 2139.2410888671875,
"epoch": 0.08,
"grad_norm": 0.1588485720271834,
"kl": 0.0001456737518310547,
"learning_rate": 7.941176470588235e-07,
"loss": 0.0693,
"reward": 1.257809430360794,
"reward_std": 0.4351053759455681,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/cosine_scaled_reward": 0.4854879528284073,
"step": 27
},
{
"completion_length": 1813.5581665039062,
"epoch": 0.08296296296296296,
"grad_norm": 0.18155204715794676,
"kl": 0.00012171268463134766,
"learning_rate": 8.235294117647058e-07,
"loss": -0.0196,
"reward": 1.2171168476343155,
"reward_std": 0.47040870040655136,
"rewards/accuracy_reward": 0.7321428954601288,
"rewards/cosine_scaled_reward": 0.4849740155041218,
"step": 28
},
{
"completion_length": 1759.1295776367188,
"epoch": 0.08592592592592592,
"grad_norm": 0.1660768801376248,
"kl": 0.00011658668518066406,
"learning_rate": 8.529411764705882e-07,
"loss": 0.0748,
"reward": 1.3603521585464478,
"reward_std": 0.4311341643333435,
"rewards/accuracy_reward": 0.816964328289032,
"rewards/cosine_scaled_reward": 0.5433878675103188,
"step": 29
},
{
"completion_length": 2200.4509887695312,
"epoch": 0.08888888888888889,
"grad_norm": 0.14464003989927063,
"kl": 0.00014543533325195312,
"learning_rate": 8.823529411764705e-07,
"loss": -0.0008,
"reward": 1.1178553104400635,
"reward_std": 0.41852162033319473,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.4303552433848381,
"step": 30
},
{
"completion_length": 2137.602813720703,
"epoch": 0.09185185185185185,
"grad_norm": 0.14000193999914415,
"kl": 0.00016105175018310547,
"learning_rate": 9.117647058823529e-07,
"loss": 0.0451,
"reward": 1.04503732919693,
"reward_std": 0.3138642441481352,
"rewards/accuracy_reward": 0.6607142984867096,
"rewards/cosine_scaled_reward": 0.38432304188609123,
"step": 31
},
{
"completion_length": 2145.90185546875,
"epoch": 0.09481481481481481,
"grad_norm": 0.14844474863429077,
"kl": 0.0001742839813232422,
"learning_rate": 9.411764705882352e-07,
"loss": 0.0578,
"reward": 1.2176358550786972,
"reward_std": 0.44562142342329025,
"rewards/accuracy_reward": 0.7321428954601288,
"rewards/cosine_scaled_reward": 0.4854929521679878,
"step": 32
},
{
"completion_length": 1576.7188568115234,
"epoch": 0.09777777777777778,
"grad_norm": 0.18427378846970008,
"kl": 0.000148773193359375,
"learning_rate": 9.705882352941176e-07,
"loss": 0.0295,
"reward": 1.5268434286117554,
"reward_std": 0.3122905343770981,
"rewards/accuracy_reward": 0.8794643431901932,
"rewards/cosine_scaled_reward": 0.6473791301250458,
"step": 33
},
{
"completion_length": 2476.5045166015625,
"epoch": 0.10074074074074074,
"grad_norm": 0.11568502492496938,
"kl": 0.0001811981201171875,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 0.9712123870849609,
"reward_std": 0.44801822677254677,
"rewards/accuracy_reward": 0.6339286118745804,
"rewards/cosine_scaled_reward": 0.33728381246328354,
"step": 34
},
{
"completion_length": 2399.5715942382812,
"epoch": 0.1037037037037037,
"grad_norm": 0.13094815811246613,
"kl": 0.00019669532775878906,
"learning_rate": 9.99975812381176e-07,
"loss": 0.047,
"reward": 0.9391455352306366,
"reward_std": 0.4169693812727928,
"rewards/accuracy_reward": 0.611607164144516,
"rewards/cosine_scaled_reward": 0.3275383338332176,
"step": 35
},
{
"completion_length": 1674.7232971191406,
"epoch": 0.10666666666666667,
"grad_norm": 0.15634669232229934,
"kl": 0.0001952648162841797,
"learning_rate": 9.999032521248854e-07,
"loss": 0.0049,
"reward": 1.107189193367958,
"reward_std": 0.51768758893013,
"rewards/accuracy_reward": 0.7187500298023224,
"rewards/cosine_scaled_reward": 0.38843920081853867,
"step": 36
},
{
"completion_length": 1777.2813415527344,
"epoch": 0.10962962962962963,
"grad_norm": 0.13993849205550643,
"kl": 0.00018024444580078125,
"learning_rate": 9.997823270313945e-07,
"loss": -0.0192,
"reward": 1.2558479607105255,
"reward_std": 0.41079528629779816,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/cosine_scaled_reward": 0.4835265427827835,
"step": 37
},
{
"completion_length": 1904.2232971191406,
"epoch": 0.11259259259259259,
"grad_norm": 0.18540633639301507,
"kl": 0.000263214111328125,
"learning_rate": 9.996130501002146e-07,
"loss": 0.0513,
"reward": 1.198735386133194,
"reward_std": 0.4133296310901642,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.4978425204753876,
"step": 38
},
{
"completion_length": 2139.1474609375,
"epoch": 0.11555555555555555,
"grad_norm": 0.17053105587422207,
"kl": 0.0003085136413574219,
"learning_rate": 9.99395439528705e-07,
"loss": 0.061,
"reward": 1.083926498889923,
"reward_std": 0.44088590145111084,
"rewards/accuracy_reward": 0.6562500298023224,
"rewards/cosine_scaled_reward": 0.4276764690876007,
"step": 39
},
{
"completion_length": 2316.5670471191406,
"epoch": 0.11851851851851852,
"grad_norm": 0.14433164353996455,
"kl": 0.0002789497375488281,
"learning_rate": 9.991295187101175e-07,
"loss": 0.0319,
"reward": 0.9402691721916199,
"reward_std": 0.5452702566981316,
"rewards/accuracy_reward": 0.6160714626312256,
"rewards/cosine_scaled_reward": 0.3241976648569107,
"step": 40
},
{
"completion_length": 1936.9599304199219,
"epoch": 0.12148148148148148,
"grad_norm": 0.15408529457273207,
"kl": 0.0003170967102050781,
"learning_rate": 9.988153162310798e-07,
"loss": -0.038,
"reward": 1.0872374176979065,
"reward_std": 0.3996356353163719,
"rewards/accuracy_reward": 0.6830357313156128,
"rewards/cosine_scaled_reward": 0.4042016677558422,
"step": 41
},
{
"completion_length": 1797.6518859863281,
"epoch": 0.12444444444444444,
"grad_norm": 0.1499032007227083,
"kl": 0.0004444122314453125,
"learning_rate": 9.98452865868525e-07,
"loss": 0.0005,
"reward": 1.3414774239063263,
"reward_std": 0.30734935216605663,
"rewards/accuracy_reward": 0.7901786118745804,
"rewards/cosine_scaled_reward": 0.5512988418340683,
"step": 42
},
{
"completion_length": 1430.83935546875,
"epoch": 0.1274074074074074,
"grad_norm": 0.21684453788165528,
"kl": 0.0005488395690917969,
"learning_rate": 9.980422065860585e-07,
"loss": 0.1485,
"reward": 1.433362364768982,
"reward_std": 0.3419237732887268,
"rewards/accuracy_reward": 0.8258928805589676,
"rewards/cosine_scaled_reward": 0.607469454407692,
"step": 43
},
{
"completion_length": 1854.7366638183594,
"epoch": 0.13037037037037036,
"grad_norm": 0.13990265672119997,
"kl": 0.000537872314453125,
"learning_rate": 9.975833825297694e-07,
"loss": -0.0013,
"reward": 1.2768487930297852,
"reward_std": 0.44341667741537094,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/cosine_scaled_reward": 0.5045273676514626,
"step": 44
},
{
"completion_length": 2298.52685546875,
"epoch": 0.13333333333333333,
"grad_norm": 0.14958567896728817,
"kl": 0.0005388259887695312,
"learning_rate": 9.970764430234865e-07,
"loss": 0.0167,
"reward": 0.8812481909990311,
"reward_std": 0.4946395307779312,
"rewards/accuracy_reward": 0.5714286044239998,
"rewards/cosine_scaled_reward": 0.30981964617967606,
"step": 45
},
{
"completion_length": 2024.4152526855469,
"epoch": 0.1362962962962963,
"grad_norm": 0.17201604459050468,
"kl": 0.0005998611450195312,
"learning_rate": 9.965214425634744e-07,
"loss": 0.0583,
"reward": 1.2592856585979462,
"reward_std": 0.40784522891044617,
"rewards/accuracy_reward": 0.7544643133878708,
"rewards/cosine_scaled_reward": 0.5048213675618172,
"step": 46
},
{
"completion_length": 1278.745620727539,
"epoch": 0.13925925925925925,
"grad_norm": 0.21231358687230578,
"kl": 0.001026153564453125,
"learning_rate": 9.959184408125757e-07,
"loss": 0.0103,
"reward": 1.3836407363414764,
"reward_std": 0.26683217100799084,
"rewards/accuracy_reward": 0.8303571790456772,
"rewards/cosine_scaled_reward": 0.5532835274934769,
"step": 47
},
{
"completion_length": 1805.4867248535156,
"epoch": 0.14222222222222222,
"grad_norm": 0.15498315626109,
"kl": 0.0008897781372070312,
"learning_rate": 9.952675025937969e-07,
"loss": 0.0114,
"reward": 1.0790625214576721,
"reward_std": 0.34751547686755657,
"rewards/accuracy_reward": 0.6741071790456772,
"rewards/cosine_scaled_reward": 0.40495534986257553,
"step": 48
},
{
"completion_length": 1740.5223999023438,
"epoch": 0.1451851851851852,
"grad_norm": 0.168185152396588,
"kl": 0.0010223388671875,
"learning_rate": 9.945686978833404e-07,
"loss": -0.0252,
"reward": 1.312384933233261,
"reward_std": 0.41872425377368927,
"rewards/accuracy_reward": 0.776785746216774,
"rewards/cosine_scaled_reward": 0.5355991423130035,
"step": 49
},
{
"completion_length": 2063.5000915527344,
"epoch": 0.14814814814814814,
"grad_norm": 0.15180436763056188,
"kl": 0.0008392333984375,
"learning_rate": 9.938221018030818e-07,
"loss": -0.0333,
"reward": 1.2742985486984253,
"reward_std": 0.44013547897338867,
"rewards/accuracy_reward": 0.7767857611179352,
"rewards/cosine_scaled_reward": 0.4975127503275871,
"step": 50
},
{
"completion_length": 2093.4598999023438,
"epoch": 0.1511111111111111,
"grad_norm": 0.15345817763060737,
"kl": 0.0010747909545898438,
"learning_rate": 9.930277946124936e-07,
"loss": 0.0163,
"reward": 1.179248034954071,
"reward_std": 0.3069497048854828,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.48281943798065186,
"step": 51
},
{
"completion_length": 1701.2678833007812,
"epoch": 0.15407407407407409,
"grad_norm": 0.49339086071731963,
"kl": 0.0011758804321289062,
"learning_rate": 9.921858617000186e-07,
"loss": 0.0374,
"reward": 1.399949163198471,
"reward_std": 0.41836266964673996,
"rewards/accuracy_reward": 0.8392857611179352,
"rewards/cosine_scaled_reward": 0.5606633871793747,
"step": 52
},
{
"completion_length": 1969.9330749511719,
"epoch": 0.15703703703703703,
"grad_norm": 0.15802222520538123,
"kl": 0.0012388229370117188,
"learning_rate": 9.912963935738895e-07,
"loss": 0.0166,
"reward": 1.2939041405916214,
"reward_std": 0.44797487556934357,
"rewards/accuracy_reward": 0.7678571790456772,
"rewards/cosine_scaled_reward": 0.5260469764471054,
"step": 53
},
{
"completion_length": 2150.6162109375,
"epoch": 0.16,
"grad_norm": 0.14836385455922624,
"kl": 0.0013933181762695312,
"learning_rate": 9.903594858523993e-07,
"loss": 0.0438,
"reward": 1.2006288468837738,
"reward_std": 0.38477384112775326,
"rewards/accuracy_reward": 0.7187500298023224,
"rewards/cosine_scaled_reward": 0.481878824532032,
"step": 54
},
{
"completion_length": 2185.0983276367188,
"epoch": 0.16296296296296298,
"grad_norm": 0.16192437178799024,
"kl": 0.001556396484375,
"learning_rate": 9.893752392536231e-07,
"loss": 0.0437,
"reward": 1.0603131651878357,
"reward_std": 0.4352143183350563,
"rewards/accuracy_reward": 0.6473214626312256,
"rewards/cosine_scaled_reward": 0.4129916988313198,
"step": 55
},
{
"completion_length": 2409.43310546875,
"epoch": 0.16592592592592592,
"grad_norm": 0.1424512212234098,
"kl": 0.001216888427734375,
"learning_rate": 9.883437595845901e-07,
"loss": 0.0462,
"reward": 0.998899444937706,
"reward_std": 0.4582284800708294,
"rewards/accuracy_reward": 0.629464328289032,
"rewards/cosine_scaled_reward": 0.3694351278245449,
"step": 56
},
{
"completion_length": 1719.02685546875,
"epoch": 0.1688888888888889,
"grad_norm": 0.1725324452769341,
"kl": 0.0020580291748046875,
"learning_rate": 9.872651577299092e-07,
"loss": 0.0393,
"reward": 1.3792076706886292,
"reward_std": 0.27744055911898613,
"rewards/accuracy_reward": 0.7991071939468384,
"rewards/cosine_scaled_reward": 0.5801005065441132,
"step": 57
},
{
"completion_length": 2229.2858276367188,
"epoch": 0.17185185185185184,
"grad_norm": 0.12530389854229343,
"kl": 0.0016937255859375,
"learning_rate": 9.861395496398497e-07,
"loss": 0.0243,
"reward": 1.0262050777673721,
"reward_std": 0.44111158698797226,
"rewards/accuracy_reward": 0.6473214626312256,
"rewards/cosine_scaled_reward": 0.378883657976985,
"step": 58
},
{
"completion_length": 1460.5536499023438,
"epoch": 0.1748148148148148,
"grad_norm": 0.17309982186787048,
"kl": 0.0025787353515625,
"learning_rate": 9.849670563178756e-07,
"loss": -0.0294,
"reward": 1.4032874703407288,
"reward_std": 0.42625588178634644,
"rewards/accuracy_reward": 0.8125000447034836,
"rewards/cosine_scaled_reward": 0.5907874628901482,
"step": 59
},
{
"completion_length": 1460.8750762939453,
"epoch": 0.17777777777777778,
"grad_norm": 0.1850063622528884,
"kl": 0.0031528472900390625,
"learning_rate": 9.83747803807638e-07,
"loss": 0.0411,
"reward": 1.3028863370418549,
"reward_std": 0.3087996020913124,
"rewards/accuracy_reward": 0.7589286267757416,
"rewards/cosine_scaled_reward": 0.5439577773213387,
"step": 60
},
{
"completion_length": 1489.8438110351562,
"epoch": 0.18074074074074073,
"grad_norm": 0.17806944014811066,
"kl": 0.00299835205078125,
"learning_rate": 9.82481923179426e-07,
"loss": 0.0554,
"reward": 1.4762336909770966,
"reward_std": 0.3607482761144638,
"rewards/accuracy_reward": 0.8348214626312256,
"rewards/cosine_scaled_reward": 0.6414122134447098,
"step": 61
},
{
"completion_length": 1651.0090026855469,
"epoch": 0.1837037037037037,
"grad_norm": 0.16163382863877152,
"kl": 0.0036773681640625,
"learning_rate": 9.811695505160755e-07,
"loss": 0.0263,
"reward": 1.3000611364841461,
"reward_std": 0.4057988375425339,
"rewards/accuracy_reward": 0.7812500447034836,
"rewards/cosine_scaled_reward": 0.5188110917806625,
"step": 62
},
{
"completion_length": 1794.1473999023438,
"epoch": 0.18666666666666668,
"grad_norm": 0.18644937494598265,
"kl": 0.003452301025390625,
"learning_rate": 9.79810826898341e-07,
"loss": 0.0748,
"reward": 1.2813213169574738,
"reward_std": 0.3088081255555153,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.5402498543262482,
"step": 63
},
{
"completion_length": 1571.9688110351562,
"epoch": 0.18962962962962962,
"grad_norm": 0.1853175840562683,
"kl": 0.003986358642578125,
"learning_rate": 9.784058983897284e-07,
"loss": 0.0044,
"reward": 1.292844980955124,
"reward_std": 0.3763498868793249,
"rewards/accuracy_reward": 0.754464328289032,
"rewards/cosine_scaled_reward": 0.5383806526660919,
"step": 64
},
{
"completion_length": 1578.4866943359375,
"epoch": 0.1925925925925926,
"grad_norm": 0.15904947917513404,
"kl": 0.0043487548828125,
"learning_rate": 9.769549160207952e-07,
"loss": 0.0214,
"reward": 1.1014029830694199,
"reward_std": 0.43177659809589386,
"rewards/accuracy_reward": 0.6741071492433548,
"rewards/cosine_scaled_reward": 0.42729582637548447,
"step": 65
},
{
"completion_length": 1857.227783203125,
"epoch": 0.19555555555555557,
"grad_norm": 0.1606816895577705,
"kl": 0.00336456298828125,
"learning_rate": 9.754580357729116e-07,
"loss": 0.0669,
"reward": 1.3189943879842758,
"reward_std": 0.3866711165755987,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/cosine_scaled_reward": 0.5466729700565338,
"step": 66
},
{
"completion_length": 1460.6875457763672,
"epoch": 0.1985185185185185,
"grad_norm": 0.19474404173280954,
"kl": 0.005466461181640625,
"learning_rate": 9.739154185614949e-07,
"loss": 0.0251,
"reward": 1.337535835802555,
"reward_std": 0.2742351219058037,
"rewards/accuracy_reward": 0.7633928880095482,
"rewards/cosine_scaled_reward": 0.5741428937762976,
"step": 67
},
{
"completion_length": 1658.80810546875,
"epoch": 0.20148148148148148,
"grad_norm": 0.1747359976489889,
"kl": 0.0050048828125,
"learning_rate": 9.723272302187106e-07,
"loss": -0.0185,
"reward": 1.2206310108304024,
"reward_std": 0.3201807998120785,
"rewards/accuracy_reward": 0.7098214700818062,
"rewards/cosine_scaled_reward": 0.510809512808919,
"step": 68
},
{
"completion_length": 1545.9509887695312,
"epoch": 0.20444444444444446,
"grad_norm": 0.16313740882439987,
"kl": 0.00656890869140625,
"learning_rate": 9.706936414756435e-07,
"loss": 0.0095,
"reward": 1.282968521118164,
"reward_std": 0.4132317379117012,
"rewards/accuracy_reward": 0.7187500298023224,
"rewards/cosine_scaled_reward": 0.5642184466123581,
"step": 69
},
{
"completion_length": 1925.4911804199219,
"epoch": 0.2074074074074074,
"grad_norm": 0.17244646258164678,
"kl": 0.0052947998046875,
"learning_rate": 9.69014827943947e-07,
"loss": 0.048,
"reward": 0.9535009413957596,
"reward_std": 0.36283179745078087,
"rewards/accuracy_reward": 0.6250000298023224,
"rewards/cosine_scaled_reward": 0.328500933945179,
"step": 70
},
{
"completion_length": 1576.6786346435547,
"epoch": 0.21037037037037037,
"grad_norm": 0.17458726146815962,
"kl": 0.006336212158203125,
"learning_rate": 9.672909700969612e-07,
"loss": 0.0494,
"reward": 1.3331668823957443,
"reward_std": 0.4533561021089554,
"rewards/accuracy_reward": 0.7767857313156128,
"rewards/cosine_scaled_reward": 0.5563811622560024,
"step": 71
},
{
"completion_length": 1622.2277221679688,
"epoch": 0.21333333333333335,
"grad_norm": 0.18810519087736544,
"kl": 0.00846099853515625,
"learning_rate": 9.65522253250316e-07,
"loss": 0.062,
"reward": 1.2747989892959595,
"reward_std": 0.3764601796865463,
"rewards/accuracy_reward": 0.7321428805589676,
"rewards/cosine_scaled_reward": 0.5426560789346695,
"step": 72
},
{
"completion_length": 1628.651870727539,
"epoch": 0.2162962962962963,
"grad_norm": 0.1913056968217199,
"kl": 0.0076446533203125,
"learning_rate": 9.637088675420063e-07,
"loss": 0.0442,
"reward": 1.371776431798935,
"reward_std": 0.4677218608558178,
"rewards/accuracy_reward": 0.7946428954601288,
"rewards/cosine_scaled_reward": 0.577133521437645,
"step": 73
},
{
"completion_length": 1836.1116943359375,
"epoch": 0.21925925925925926,
"grad_norm": 0.16928420028282418,
"kl": 0.00873565673828125,
"learning_rate": 9.618510079119533e-07,
"loss": 0.0887,
"reward": 1.2351452708244324,
"reward_std": 0.3623766005039215,
"rewards/accuracy_reward": 0.7276786118745804,
"rewards/cosine_scaled_reward": 0.5074666365981102,
"step": 74
},
{
"completion_length": 1350.9152526855469,
"epoch": 0.2222222222222222,
"grad_norm": 0.1893299932553231,
"kl": 0.009857177734375,
"learning_rate": 9.59948874081048e-07,
"loss": 0.0722,
"reward": 1.0651443749666214,
"reward_std": 0.486992284655571,
"rewards/accuracy_reward": 0.6651786118745804,
"rewards/cosine_scaled_reward": 0.3999657705426216,
"step": 75
},
{
"completion_length": 1341.65185546875,
"epoch": 0.22518518518518518,
"grad_norm": 0.19874429784134148,
"kl": 0.0109100341796875,
"learning_rate": 9.580026705296824e-07,
"loss": 0.0374,
"reward": 1.4165225625038147,
"reward_std": 0.3073725774884224,
"rewards/accuracy_reward": 0.7991071790456772,
"rewards/cosine_scaled_reward": 0.6174153983592987,
"step": 76
},
{
"completion_length": 1584.9866485595703,
"epoch": 0.22814814814814816,
"grad_norm": 0.192470093500917,
"kl": 0.0110626220703125,
"learning_rate": 9.56012606475766e-07,
"loss": 0.036,
"reward": 1.2434572279453278,
"reward_std": 0.42061255872249603,
"rewards/accuracy_reward": 0.7142857313156128,
"rewards/cosine_scaled_reward": 0.5291714444756508,
"step": 77
},
{
"completion_length": 1354.6607666015625,
"epoch": 0.2311111111111111,
"grad_norm": 0.20023302979275232,
"kl": 0.012298583984375,
"learning_rate": 9.539788958522353e-07,
"loss": 0.0618,
"reward": 1.3703485876321793,
"reward_std": 0.39164508879184723,
"rewards/accuracy_reward": 0.785714328289032,
"rewards/cosine_scaled_reward": 0.5846342295408249,
"step": 78
},
{
"completion_length": 1133.839340209961,
"epoch": 0.23407407407407407,
"grad_norm": 0.1842508631519829,
"kl": 0.0138397216796875,
"learning_rate": 9.519017572840562e-07,
"loss": 0.0505,
"reward": 1.3714804649353027,
"reward_std": 0.41867052018642426,
"rewards/accuracy_reward": 0.7857143133878708,
"rewards/cosine_scaled_reward": 0.5857661366462708,
"step": 79
},
{
"completion_length": 1497.263412475586,
"epoch": 0.23703703703703705,
"grad_norm": 0.1959550919083615,
"kl": 0.0127410888671875,
"learning_rate": 9.49781414064722e-07,
"loss": 0.0741,
"reward": 1.1733836829662323,
"reward_std": 0.5299587771296501,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.4635622203350067,
"step": 80
},
{
"completion_length": 1383.0313262939453,
"epoch": 0.24,
"grad_norm": 0.21509846180208117,
"kl": 0.0157470703125,
"learning_rate": 9.476180941322485e-07,
"loss": 0.0916,
"reward": 1.1244508922100067,
"reward_std": 0.31519509851932526,
"rewards/accuracy_reward": 0.6562500223517418,
"rewards/cosine_scaled_reward": 0.4682008996605873,
"step": 81
},
{
"completion_length": 1542.4465026855469,
"epoch": 0.24296296296296296,
"grad_norm": 0.17903856801808032,
"kl": 0.0135650634765625,
"learning_rate": 9.454120300446708e-07,
"loss": 0.1177,
"reward": 1.032856598496437,
"reward_std": 0.37768274173140526,
"rewards/accuracy_reward": 0.6160714328289032,
"rewards/cosine_scaled_reward": 0.4167851284146309,
"step": 82
},
{
"completion_length": 1229.1741638183594,
"epoch": 0.24592592592592594,
"grad_norm": 0.1895334108803511,
"kl": 0.0172882080078125,
"learning_rate": 9.431634589550437e-07,
"loss": 0.0994,
"reward": 1.2515216022729874,
"reward_std": 0.329837616533041,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5417001619935036,
"step": 83
},
{
"completion_length": 1215.3705749511719,
"epoch": 0.24888888888888888,
"grad_norm": 0.25854148885682715,
"kl": 0.0201416015625,
"learning_rate": 9.408726225859463e-07,
"loss": -0.029,
"reward": 1.17463618516922,
"reward_std": 0.4334421083331108,
"rewards/accuracy_reward": 0.6651785969734192,
"rewards/cosine_scaled_reward": 0.5094575956463814,
"step": 84
},
{
"completion_length": 1066.089340209961,
"epoch": 0.2518518518518518,
"grad_norm": 0.2213349369656864,
"kl": 0.021392822265625,
"learning_rate": 9.385397672034984e-07,
"loss": 0.1045,
"reward": 1.4549466967582703,
"reward_std": 0.3001119792461395,
"rewards/accuracy_reward": 0.8125000447034836,
"rewards/cosine_scaled_reward": 0.6424466073513031,
"step": 85
},
{
"completion_length": 1157.5848693847656,
"epoch": 0.2548148148148148,
"grad_norm": 0.22308540448597394,
"kl": 0.024993896484375,
"learning_rate": 9.361651435908859e-07,
"loss": 0.0057,
"reward": 1.383677989244461,
"reward_std": 0.4137191101908684,
"rewards/accuracy_reward": 0.7633928954601288,
"rewards/cosine_scaled_reward": 0.6202851235866547,
"step": 86
},
{
"completion_length": 1113.575942993164,
"epoch": 0.2577777777777778,
"grad_norm": 0.2376225453935383,
"kl": 0.028778076171875,
"learning_rate": 9.337490070214005e-07,
"loss": 0.0684,
"reward": 1.496872365474701,
"reward_std": 0.41324392706155777,
"rewards/accuracy_reward": 0.8125000298023224,
"rewards/cosine_scaled_reward": 0.6843723505735397,
"step": 87
},
{
"completion_length": 944.7812957763672,
"epoch": 0.2607407407407407,
"grad_norm": 0.20249928269924236,
"kl": 0.02642822265625,
"learning_rate": 9.312916172309998e-07,
"loss": 0.0757,
"reward": 1.4803976714611053,
"reward_std": 0.3520447090268135,
"rewards/accuracy_reward": 0.8125000447034836,
"rewards/cosine_scaled_reward": 0.667897641658783,
"step": 88
},
{
"completion_length": 1494.6563110351562,
"epoch": 0.2637037037037037,
"grad_norm": 0.19755508814823958,
"kl": 0.02459716796875,
"learning_rate": 9.287932383903842e-07,
"loss": 0.0235,
"reward": 1.2657422125339508,
"reward_std": 0.35956617817282677,
"rewards/accuracy_reward": 0.7142857313156128,
"rewards/cosine_scaled_reward": 0.5514564663171768,
"step": 89
},
{
"completion_length": 1329.3214721679688,
"epoch": 0.26666666666666666,
"grad_norm": 0.21208392637462176,
"kl": 0.02642822265625,
"learning_rate": 9.262541390765981e-07,
"loss": 0.0319,
"reward": 1.1303415894508362,
"reward_std": 0.5117903053760529,
"rewards/accuracy_reward": 0.6607143133878708,
"rewards/cosine_scaled_reward": 0.4696272984147072,
"step": 90
},
{
"completion_length": 942.5982666015625,
"epoch": 0.2696296296296296,
"grad_norm": 0.24191683773068134,
"kl": 0.035491943359375,
"learning_rate": 9.236745922441589e-07,
"loss": 0.0795,
"reward": 1.430460512638092,
"reward_std": 0.3666737973690033,
"rewards/accuracy_reward": 0.7812500298023224,
"rewards/cosine_scaled_reward": 0.6492104828357697,
"step": 91
},
{
"completion_length": 1154.8437957763672,
"epoch": 0.2725925925925926,
"grad_norm": 0.19725864085814365,
"kl": 0.030853271484375,
"learning_rate": 9.210548751957133e-07,
"loss": -0.0129,
"reward": 1.3163889944553375,
"reward_std": 0.5869083181023598,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.5753175765275955,
"step": 92
},
{
"completion_length": 1329.2678833007812,
"epoch": 0.27555555555555555,
"grad_norm": 0.1823845016827509,
"kl": 0.03363037109375,
"learning_rate": 9.183952695522273e-07,
"loss": 0.0291,
"reward": 1.2691433429718018,
"reward_std": 0.4903425797820091,
"rewards/accuracy_reward": 0.7098214775323868,
"rewards/cosine_scaled_reward": 0.5593218728899956,
"step": 93
},
{
"completion_length": 1625.6563262939453,
"epoch": 0.2785185185185185,
"grad_norm": 0.2494826179007093,
"kl": 0.03369140625,
"learning_rate": 9.156960612227125e-07,
"loss": 0.0871,
"reward": 1.1840568780899048,
"reward_std": 0.4199274815618992,
"rewards/accuracy_reward": 0.6696428954601288,
"rewards/cosine_scaled_reward": 0.5144139900803566,
"step": 94
},
{
"completion_length": 1420.6339721679688,
"epoch": 0.2814814814814815,
"grad_norm": 0.2102867426975808,
"kl": 0.03350830078125,
"learning_rate": 9.129575403734897e-07,
"loss": 0.0645,
"reward": 1.2297062426805496,
"reward_std": 0.47686289995908737,
"rewards/accuracy_reward": 0.6964286118745804,
"rewards/cosine_scaled_reward": 0.5332776308059692,
"step": 95
},
{
"completion_length": 1161.6607818603516,
"epoch": 0.28444444444444444,
"grad_norm": 0.19499198339251772,
"kl": 0.041473388671875,
"learning_rate": 9.101800013969962e-07,
"loss": 0.0625,
"reward": 1.1435084491968155,
"reward_std": 0.4009154736995697,
"rewards/accuracy_reward": 0.647321455180645,
"rewards/cosine_scaled_reward": 0.496186975389719,
"step": 96
},
{
"completion_length": 1372.5357818603516,
"epoch": 0.2874074074074074,
"grad_norm": 0.19600482088111124,
"kl": 0.039581298828125,
"learning_rate": 9.07363742880139e-07,
"loss": 0.0203,
"reward": 1.3975748717784882,
"reward_std": 0.44245097786188126,
"rewards/accuracy_reward": 0.76339291036129,
"rewards/cosine_scaled_reward": 0.634181946516037,
"step": 97
},
{
"completion_length": 1010.8259429931641,
"epoch": 0.2903703703703704,
"grad_norm": 0.20258954456169906,
"kl": 0.04852294921875,
"learning_rate": 9.045090675721959e-07,
"loss": 0.0426,
"reward": 1.2528180032968521,
"reward_std": 0.5208085626363754,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.5519250854849815,
"step": 98
},
{
"completion_length": 1133.808090209961,
"epoch": 0.29333333333333333,
"grad_norm": 0.24010785767735876,
"kl": 0.0523681640625,
"learning_rate": 9.016162823522701e-07,
"loss": -0.0414,
"reward": 1.1999054104089737,
"reward_std": 0.40023650601506233,
"rewards/accuracy_reward": 0.6830357313156128,
"rewards/cosine_scaled_reward": 0.5168696194887161,
"step": 99
},
{
"completion_length": 868.0268249511719,
"epoch": 0.2962962962962963,
"grad_norm": 0.2556790055127662,
"kl": 0.05438232421875,
"learning_rate": 8.986856981963004e-07,
"loss": 0.0141,
"reward": 1.2974393367767334,
"reward_std": 0.42083971202373505,
"rewards/accuracy_reward": 0.7098214775323868,
"rewards/cosine_scaled_reward": 0.5876179337501526,
"step": 100
},
{
"completion_length": 1285.05810546875,
"epoch": 0.2992592592592593,
"grad_norm": 0.2672184140592102,
"kl": 0.0479736328125,
"learning_rate": 8.957176301436312e-07,
"loss": 0.0897,
"reward": 1.2107618898153305,
"reward_std": 0.4981778487563133,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/cosine_scaled_reward": 0.5187975913286209,
"step": 101
},
{
"completion_length": 1239.4420013427734,
"epoch": 0.3022222222222222,
"grad_norm": 0.26029036344061485,
"kl": 0.051483154296875,
"learning_rate": 8.927123972631457e-07,
"loss": 0.135,
"reward": 1.3805895149707794,
"reward_std": 0.4160504639148712,
"rewards/accuracy_reward": 0.7589286118745804,
"rewards/cosine_scaled_reward": 0.6216609328985214,
"step": 102
},
{
"completion_length": 779.2232666015625,
"epoch": 0.30518518518518517,
"grad_norm": 0.27072308594934286,
"kl": 0.06658935546875,
"learning_rate": 8.896703226189656e-07,
"loss": 0.0549,
"reward": 1.3102031350135803,
"reward_std": 0.4214232973754406,
"rewards/accuracy_reward": 0.7232143059372902,
"rewards/cosine_scaled_reward": 0.5869888141751289,
"step": 103
},
{
"completion_length": 1419.3795471191406,
"epoch": 0.30814814814814817,
"grad_norm": 0.24404839226709768,
"kl": 0.05548095703125,
"learning_rate": 8.865917332357217e-07,
"loss": -0.0445,
"reward": 1.1191436797380447,
"reward_std": 0.5001409500837326,
"rewards/accuracy_reward": 0.6383928805589676,
"rewards/cosine_scaled_reward": 0.48075081408023834,
"step": 104
},
{
"completion_length": 1380.4598846435547,
"epoch": 0.3111111111111111,
"grad_norm": 0.22891576288171045,
"kl": 0.0596923828125,
"learning_rate": 8.834769600633986e-07,
"loss": 0.0395,
"reward": 1.190809726715088,
"reward_std": 0.4635982885956764,
"rewards/accuracy_reward": 0.6607143133878708,
"rewards/cosine_scaled_reward": 0.5300954133272171,
"step": 105
},
{
"completion_length": 1261.1250610351562,
"epoch": 0.31407407407407406,
"grad_norm": 0.2543103499852788,
"kl": 0.072265625,
"learning_rate": 8.803263379417572e-07,
"loss": 0.0503,
"reward": 1.297232449054718,
"reward_std": 0.4232187941670418,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5874110013246536,
"step": 106
},
{
"completion_length": 1787.1965026855469,
"epoch": 0.31703703703703706,
"grad_norm": 0.16701870028955876,
"kl": 0.042205810546875,
"learning_rate": 8.771402055643391e-07,
"loss": 0.0263,
"reward": 0.8708714246749878,
"reward_std": 0.5340973809361458,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/cosine_scaled_reward": 0.3351571261882782,
"step": 107
},
{
"completion_length": 1069.0223770141602,
"epoch": 0.32,
"grad_norm": 0.245970485062095,
"kl": 0.0693359375,
"learning_rate": 8.73918905442058e-07,
"loss": 0.1147,
"reward": 1.1969991326332092,
"reward_std": 0.3443680591881275,
"rewards/accuracy_reward": 0.6562500298023224,
"rewards/cosine_scaled_reward": 0.5407490879297256,
"step": 108
},
{
"completion_length": 1027.7768249511719,
"epoch": 0.32296296296296295,
"grad_norm": 0.2561486876196046,
"kl": 0.0853271484375,
"learning_rate": 8.706627838663782e-07,
"loss": 0.0559,
"reward": 1.1508228331804276,
"reward_std": 0.5210666060447693,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5168942138552666,
"step": 109
},
{
"completion_length": 1099.7411499023438,
"epoch": 0.32592592592592595,
"grad_norm": 0.25517403843187497,
"kl": 0.06573486328125,
"learning_rate": 8.673721908720884e-07,
"loss": 0.0741,
"reward": 1.2350784838199615,
"reward_std": 0.4044996239244938,
"rewards/accuracy_reward": 0.6875000447034836,
"rewards/cosine_scaled_reward": 0.5475784614682198,
"step": 110
},
{
"completion_length": 1189.6741638183594,
"epoch": 0.3288888888888889,
"grad_norm": 0.23395331806651343,
"kl": 0.0775146484375,
"learning_rate": 8.640474801996732e-07,
"loss": 0.0665,
"reward": 1.3876985013484955,
"reward_std": 0.3757442235946655,
"rewards/accuracy_reward": 0.7678571790456772,
"rewards/cosine_scaled_reward": 0.6198412925004959,
"step": 111
},
{
"completion_length": 977.1652145385742,
"epoch": 0.33185185185185184,
"grad_norm": 0.28043157986328343,
"kl": 0.088134765625,
"learning_rate": 8.606890092572861e-07,
"loss": -0.0177,
"reward": 1.1703044474124908,
"reward_std": 0.5287574678659439,
"rewards/accuracy_reward": 0.6428571790456772,
"rewards/cosine_scaled_reward": 0.5274473056197166,
"step": 112
},
{
"completion_length": 1058.357162475586,
"epoch": 0.3348148148148148,
"grad_norm": 0.28507497649017294,
"kl": 0.08673095703125,
"learning_rate": 8.572971390823266e-07,
"loss": 0.0739,
"reward": 1.270705059170723,
"reward_std": 0.5322270393371582,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.5742765069007874,
"step": 113
},
{
"completion_length": 1392.9152374267578,
"epoch": 0.3377777777777778,
"grad_norm": 0.22063315207690926,
"kl": 0.0809326171875,
"learning_rate": 8.538722343026302e-07,
"loss": 0.0446,
"reward": 1.007172241806984,
"reward_std": 0.37207865715026855,
"rewards/accuracy_reward": 0.5625000223517418,
"rewards/cosine_scaled_reward": 0.44467223435640335,
"step": 114
},
{
"completion_length": 859.0893211364746,
"epoch": 0.34074074074074073,
"grad_norm": 0.300236056528611,
"kl": 0.1016845703125,
"learning_rate": 8.50414663097269e-07,
"loss": 0.0039,
"reward": 1.4534207880496979,
"reward_std": 0.5292069166898727,
"rewards/accuracy_reward": 0.776785746216774,
"rewards/cosine_scaled_reward": 0.6766350567340851,
"step": 115
},
{
"completion_length": 1658.6161651611328,
"epoch": 0.3437037037037037,
"grad_norm": 0.21810828567849236,
"kl": 0.0694580078125,
"learning_rate": 8.46924797156974e-07,
"loss": 0.0056,
"reward": 1.076892763376236,
"reward_std": 0.5404257103800774,
"rewards/accuracy_reward": 0.6116071790456772,
"rewards/cosine_scaled_reward": 0.46528560668230057,
"step": 116
},
{
"completion_length": 1132.7455749511719,
"epoch": 0.3466666666666667,
"grad_norm": 0.2561231185493826,
"kl": 0.0867919921875,
"learning_rate": 8.434030116441765e-07,
"loss": 0.0196,
"reward": 1.271399825811386,
"reward_std": 0.5448361113667488,
"rewards/accuracy_reward": 0.705357164144516,
"rewards/cosine_scaled_reward": 0.5660426765680313,
"step": 117
},
{
"completion_length": 1571.7902526855469,
"epoch": 0.3496296296296296,
"grad_norm": 0.23889886690383044,
"kl": 0.07391357421875,
"learning_rate": 8.39849685152679e-07,
"loss": 0.0414,
"reward": 1.0188361555337906,
"reward_std": 0.5471197664737701,
"rewards/accuracy_reward": 0.584821455180645,
"rewards/cosine_scaled_reward": 0.4340147264301777,
"step": 118
},
{
"completion_length": 938.7991333007812,
"epoch": 0.35259259259259257,
"grad_norm": 0.26749074498213615,
"kl": 0.0892333984375,
"learning_rate": 8.36265199666956e-07,
"loss": 0.0489,
"reward": 1.4469529390335083,
"reward_std": 0.43392040487378836,
"rewards/accuracy_reward": 0.7812500298023224,
"rewards/cosine_scaled_reward": 0.6657029464840889,
"step": 119
},
{
"completion_length": 842.6027221679688,
"epoch": 0.35555555555555557,
"grad_norm": 0.2779316922692756,
"kl": 0.1029052734375,
"learning_rate": 8.326499405210902e-07,
"loss": 0.0706,
"reward": 1.3225018680095673,
"reward_std": 0.5245833843946457,
"rewards/accuracy_reward": 0.7008928954601288,
"rewards/cosine_scaled_reward": 0.6216090172529221,
"step": 120
},
{
"completion_length": 1482.9911193847656,
"epoch": 0.3585185185185185,
"grad_norm": 0.20900230828152339,
"kl": 0.06805419921875,
"learning_rate": 8.290042963573488e-07,
"loss": -0.0292,
"reward": 1.2458033114671707,
"reward_std": 0.586730495095253,
"rewards/accuracy_reward": 0.6875000447034836,
"rewards/cosine_scaled_reward": 0.5583032667636871,
"step": 121
},
{
"completion_length": 899.1562805175781,
"epoch": 0.36148148148148146,
"grad_norm": 0.27477236439012975,
"kl": 0.1024169921875,
"learning_rate": 8.25328659084405e-07,
"loss": 0.0473,
"reward": 1.3379029631614685,
"reward_std": 0.3929406702518463,
"rewards/accuracy_reward": 0.7276785969734192,
"rewards/cosine_scaled_reward": 0.6102243810892105,
"step": 122
},
{
"completion_length": 1149.5938262939453,
"epoch": 0.36444444444444446,
"grad_norm": 0.26020835723478647,
"kl": 0.082275390625,
"learning_rate": 8.216234238352065e-07,
"loss": 0.0949,
"reward": 1.515016108751297,
"reward_std": 0.453274130821228,
"rewards/accuracy_reward": 0.808035746216774,
"rewards/cosine_scaled_reward": 0.7069803923368454,
"step": 123
},
{
"completion_length": 1142.1786193847656,
"epoch": 0.3674074074074074,
"grad_norm": 0.3235095014491872,
"kl": 0.11279296875,
"learning_rate": 8.178889889244996e-07,
"loss": 0.0702,
"reward": 1.3975563943386078,
"reward_std": 0.2819400802254677,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.6475563049316406,
"step": 124
},
{
"completion_length": 876.8393249511719,
"epoch": 0.37037037037037035,
"grad_norm": 0.27873510040561517,
"kl": 0.1097412109375,
"learning_rate": 8.141257558060092e-07,
"loss": -0.005,
"reward": 1.4616929292678833,
"reward_std": 0.5415130406618118,
"rewards/accuracy_reward": 0.7678571790456772,
"rewards/cosine_scaled_reward": 0.6938357651233673,
"step": 125
},
{
"completion_length": 1556.3661499023438,
"epoch": 0.37333333333333335,
"grad_norm": 0.22490676274021995,
"kl": 0.06610107421875,
"learning_rate": 8.103341290292833e-07,
"loss": -0.0147,
"reward": 0.9707639068365097,
"reward_std": 0.667090579867363,
"rewards/accuracy_reward": 0.5714285895228386,
"rewards/cosine_scaled_reward": 0.39933526888489723,
"step": 126
},
{
"completion_length": 1085.2723693847656,
"epoch": 0.3762962962962963,
"grad_norm": 0.2609771942189481,
"kl": 0.087646484375,
"learning_rate": 8.065145161962021e-07,
"loss": 0.0467,
"reward": 1.2797034680843353,
"reward_std": 0.5504499524831772,
"rewards/accuracy_reward": 0.723214328289032,
"rewards/cosine_scaled_reward": 0.5564891993999481,
"step": 127
},
{
"completion_length": 1202.2411193847656,
"epoch": 0.37925925925925924,
"grad_norm": 0.23147646598137125,
"kl": 0.08319091796875,
"learning_rate": 8.02667327917163e-07,
"loss": -0.0018,
"reward": 1.2336037755012512,
"reward_std": 0.5526604950428009,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.5371751636266708,
"step": 128
},
{
"completion_length": 679.9955673217773,
"epoch": 0.38222222222222224,
"grad_norm": 0.6332260844439259,
"kl": 0.1307373046875,
"learning_rate": 7.987929777669372e-07,
"loss": -0.0718,
"reward": 1.3172721862792969,
"reward_std": 0.49390799552202225,
"rewards/accuracy_reward": 0.6964286118745804,
"rewards/cosine_scaled_reward": 0.6208435744047165,
"step": 129
},
{
"completion_length": 1333.0089721679688,
"epoch": 0.3851851851851852,
"grad_norm": 0.2913669404852735,
"kl": 0.0980224609375,
"learning_rate": 7.948918822402123e-07,
"loss": 0.0561,
"reward": 1.1632278561592102,
"reward_std": 0.45225123316049576,
"rewards/accuracy_reward": 0.6383928805589676,
"rewards/cosine_scaled_reward": 0.5248349532485008,
"step": 130
},
{
"completion_length": 1337.575942993164,
"epoch": 0.38814814814814813,
"grad_norm": 0.2186167579316726,
"kl": 0.08392333984375,
"learning_rate": 7.909644607068174e-07,
"loss": 0.001,
"reward": 1.1138341426849365,
"reward_std": 0.5587991923093796,
"rewards/accuracy_reward": 0.6339286118745804,
"rewards/cosine_scaled_reward": 0.47990551590919495,
"step": 131
},
{
"completion_length": 1420.0491790771484,
"epoch": 0.39111111111111113,
"grad_norm": 0.2919760236776904,
"kl": 0.09759521484375,
"learning_rate": 7.870111353666414e-07,
"loss": 0.0648,
"reward": 1.2764753997325897,
"reward_std": 0.5266730934381485,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.5711181536316872,
"step": 132
},
{
"completion_length": 1444.946517944336,
"epoch": 0.3940740740740741,
"grad_norm": 0.25522552398542675,
"kl": 0.08349609375,
"learning_rate": 7.830323312042464e-07,
"loss": 0.0683,
"reward": 1.188013032078743,
"reward_std": 0.37928835675120354,
"rewards/accuracy_reward": 0.6696428805589676,
"rewards/cosine_scaled_reward": 0.5183701142668724,
"step": 133
},
{
"completion_length": 1024.1027145385742,
"epoch": 0.397037037037037,
"grad_norm": 0.26298616639534006,
"kl": 0.10247802734375,
"learning_rate": 7.790284759431809e-07,
"loss": 0.0189,
"reward": 1.3900260627269745,
"reward_std": 0.30807338282465935,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.6400260329246521,
"step": 134
},
{
"completion_length": 1421.58935546875,
"epoch": 0.4,
"grad_norm": 0.24721365088048822,
"kl": 0.07550048828125,
"learning_rate": 7.75e-07,
"loss": 0.0589,
"reward": 1.1417311877012253,
"reward_std": 0.33604446426033974,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5078025981783867,
"step": 135
},
{
"completion_length": 1269.888442993164,
"epoch": 0.40296296296296297,
"grad_norm": 0.27976499664133275,
"kl": 0.104248046875,
"learning_rate": 7.709473364379949e-07,
"loss": 0.0435,
"reward": 1.158977895975113,
"reward_std": 0.3557581529021263,
"rewards/accuracy_reward": 0.6473214626312256,
"rewards/cosine_scaled_reward": 0.5116564705967903,
"step": 136
},
{
"completion_length": 1240.1295318603516,
"epoch": 0.4059259259259259,
"grad_norm": 0.26528035666816113,
"kl": 0.117431640625,
"learning_rate": 7.668709209206391e-07,
"loss": -0.0069,
"reward": 1.4490948617458344,
"reward_std": 0.47412872314453125,
"rewards/accuracy_reward": 0.7812500298023224,
"rewards/cosine_scaled_reward": 0.6678448840975761,
"step": 137
},
{
"completion_length": 1057.0848693847656,
"epoch": 0.4088888888888889,
"grad_norm": 0.2625748064905278,
"kl": 0.10546875,
"learning_rate": 7.627711916647531e-07,
"loss": 0.0357,
"reward": 1.233490526676178,
"reward_std": 0.48493514209985733,
"rewards/accuracy_reward": 0.683035746216774,
"rewards/cosine_scaled_reward": 0.5504548028111458,
"step": 138
},
{
"completion_length": 1540.4732666015625,
"epoch": 0.41185185185185186,
"grad_norm": 0.3012629488379583,
"kl": 0.08453369140625,
"learning_rate": 7.586485893933972e-07,
"loss": 0.056,
"reward": 1.2701046466827393,
"reward_std": 0.5633516684174538,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.5647474825382233,
"step": 139
},
{
"completion_length": 1508.0804138183594,
"epoch": 0.4148148148148148,
"grad_norm": 0.24916923862969306,
"kl": 0.10467529296875,
"learning_rate": 7.545035572884928e-07,
"loss": -0.0049,
"reward": 1.271433025598526,
"reward_std": 0.4527568593621254,
"rewards/accuracy_reward": 0.7232143133878708,
"rewards/cosine_scaled_reward": 0.5482186861336231,
"step": 140
},
{
"completion_length": 1373.6786193847656,
"epoch": 0.4177777777777778,
"grad_norm": 0.26167485604048296,
"kl": 0.1085205078125,
"learning_rate": 7.503365409431801e-07,
"loss": 0.0049,
"reward": 1.2524654418230057,
"reward_std": 0.47800225764513016,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5649654343724251,
"step": 141
},
{
"completion_length": 902.8259124755859,
"epoch": 0.42074074074074075,
"grad_norm": 0.36269394404753696,
"kl": 0.154296875,
"learning_rate": 7.46147988313917e-07,
"loss": 0.0854,
"reward": 1.6476789712905884,
"reward_std": 0.4029630944132805,
"rewards/accuracy_reward": 0.8616071790456772,
"rewards/cosine_scaled_reward": 0.7860718369483948,
"step": 142
},
{
"completion_length": 1099.325942993164,
"epoch": 0.4237037037037037,
"grad_norm": 0.33429217324642896,
"kl": 0.1253662109375,
"learning_rate": 7.419383496723229e-07,
"loss": 0.0115,
"reward": 1.1484860479831696,
"reward_std": 0.49667520076036453,
"rewards/accuracy_reward": 0.6383928805589676,
"rewards/cosine_scaled_reward": 0.510093130171299,
"step": 143
},
{
"completion_length": 1838.6786499023438,
"epoch": 0.4266666666666667,
"grad_norm": 0.25993606256207835,
"kl": 0.08837890625,
"learning_rate": 7.377080775567751e-07,
"loss": 0.1217,
"reward": 0.9903404861688614,
"reward_std": 0.387987844645977,
"rewards/accuracy_reward": 0.558035746216774,
"rewards/cosine_scaled_reward": 0.4323047176003456,
"step": 144
},
{
"completion_length": 953.6339797973633,
"epoch": 0.42962962962962964,
"grad_norm": 0.2719750141796096,
"kl": 0.1390380859375,
"learning_rate": 7.334576267237599e-07,
"loss": 0.0446,
"reward": 1.3718172013759613,
"reward_std": 0.475243978202343,
"rewards/accuracy_reward": 0.7321428805589676,
"rewards/cosine_scaled_reward": 0.6396742761135101,
"step": 145
},
{
"completion_length": 1215.2634582519531,
"epoch": 0.4325925925925926,
"grad_norm": 0.27796212001837417,
"kl": 0.111572265625,
"learning_rate": 7.291874540989869e-07,
"loss": -0.0992,
"reward": 1.2015317529439926,
"reward_std": 0.618925541639328,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/cosine_scaled_reward": 0.5408174768090248,
"step": 146
},
{
"completion_length": 1392.4286499023438,
"epoch": 0.43555555555555553,
"grad_norm": 0.28571629899742523,
"kl": 0.123046875,
"learning_rate": 7.248980187282679e-07,
"loss": 0.0525,
"reward": 1.1790579408407211,
"reward_std": 0.47974304109811783,
"rewards/accuracy_reward": 0.6607142984867096,
"rewards/cosine_scaled_reward": 0.5183436721563339,
"step": 147
},
{
"completion_length": 1062.90185546875,
"epoch": 0.43851851851851853,
"grad_norm": 0.29903621450951867,
"kl": 0.1376953125,
"learning_rate": 7.205897817281707e-07,
"loss": 0.0587,
"reward": 1.2066510319709778,
"reward_std": 0.550245389342308,
"rewards/accuracy_reward": 0.6607143133878708,
"rewards/cosine_scaled_reward": 0.545936681330204,
"step": 148
},
{
"completion_length": 781.2277069091797,
"epoch": 0.4414814814814815,
"grad_norm": 0.3594108148854528,
"kl": 0.173583984375,
"learning_rate": 7.162632062364482e-07,
"loss": 0.0419,
"reward": 1.5628540217876434,
"reward_std": 0.3035361301153898,
"rewards/accuracy_reward": 0.8214286118745804,
"rewards/cosine_scaled_reward": 0.741425409913063,
"step": 149
},
{
"completion_length": 1027.4375610351562,
"epoch": 0.4444444444444444,
"grad_norm": 0.40168570307081203,
"kl": 0.179443359375,
"learning_rate": 7.119187573622503e-07,
"loss": 0.0299,
"reward": 1.208786502480507,
"reward_std": 0.4775058552622795,
"rewards/accuracy_reward": 0.647321455180645,
"rewards/cosine_scaled_reward": 0.5614650174975395,
"step": 150
},
{
"completion_length": 1068.450912475586,
"epoch": 0.4474074074074074,
"grad_norm": 0.36202416334148413,
"kl": 0.154541015625,
"learning_rate": 7.075569021361258e-07,
"loss": -0.0604,
"reward": 1.1638824492692947,
"reward_std": 0.6943438649177551,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5299538299441338,
"step": 151
},
{
"completion_length": 1558.2411499023438,
"epoch": 0.45037037037037037,
"grad_norm": 0.3270636332627284,
"kl": 0.1575927734375,
"learning_rate": 7.031781094598147e-07,
"loss": 0.0083,
"reward": 1.0367525964975357,
"reward_std": 0.5159935727715492,
"rewards/accuracy_reward": 0.566964328289032,
"rewards/cosine_scaled_reward": 0.4697883054614067,
"step": 152
},
{
"completion_length": 1267.9375457763672,
"epoch": 0.4533333333333333,
"grad_norm": 0.2601830199231623,
"kl": 0.142333984375,
"learning_rate": 6.987828500558422e-07,
"loss": 0.0311,
"reward": 1.200500175356865,
"reward_std": 0.4817045107483864,
"rewards/accuracy_reward": 0.6562500298023224,
"rewards/cosine_scaled_reward": 0.5442501083016396,
"step": 153
},
{
"completion_length": 1155.1473693847656,
"epoch": 0.4562962962962963,
"grad_norm": 0.3657929469140216,
"kl": 0.166748046875,
"learning_rate": 6.943715964169153e-07,
"loss": 0.0741,
"reward": 1.30518639087677,
"reward_std": 0.4314998611807823,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5953649058938026,
"step": 154
},
{
"completion_length": 1294.2500305175781,
"epoch": 0.45925925925925926,
"grad_norm": 0.3025845364980038,
"kl": 0.1407470703125,
"learning_rate": 6.899448227551302e-07,
"loss": 0.0423,
"reward": 1.4187033772468567,
"reward_std": 0.47818124294281006,
"rewards/accuracy_reward": 0.7589286267757416,
"rewards/cosine_scaled_reward": 0.6597748026251793,
"step": 155
},
{
"completion_length": 1628.3482971191406,
"epoch": 0.4622222222222222,
"grad_norm": 0.3033982983842964,
"kl": 0.1275634765625,
"learning_rate": 6.85503004950993e-07,
"loss": -0.0077,
"reward": 1.1377353817224503,
"reward_std": 0.5007912814617157,
"rewards/accuracy_reward": 0.6428571790456772,
"rewards/cosine_scaled_reward": 0.4948781877756119,
"step": 156
},
{
"completion_length": 1181.90629196167,
"epoch": 0.4651851851851852,
"grad_norm": 0.31424031764619936,
"kl": 0.1624755859375,
"learning_rate": 6.810466205022635e-07,
"loss": 0.0409,
"reward": 1.3390273749828339,
"reward_std": 0.41271302849054337,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.5890273749828339,
"step": 157
},
{
"completion_length": 1074.7947082519531,
"epoch": 0.46814814814814815,
"grad_norm": 0.35310245273581153,
"kl": 0.1787109375,
"learning_rate": 6.765761484726232e-07,
"loss": -0.0245,
"reward": 1.2675886452198029,
"reward_std": 0.5314782559871674,
"rewards/accuracy_reward": 0.683035746216774,
"rewards/cosine_scaled_reward": 0.5845528990030289,
"step": 158
},
{
"completion_length": 1470.8170166015625,
"epoch": 0.4711111111111111,
"grad_norm": 0.31730711857033983,
"kl": 0.12823486328125,
"learning_rate": 6.720920694401765e-07,
"loss": 0.011,
"reward": 1.2019407004117966,
"reward_std": 0.5537143424153328,
"rewards/accuracy_reward": 0.6562500447034836,
"rewards/cosine_scaled_reward": 0.5456906482577324,
"step": 159
},
{
"completion_length": 1101.3482666015625,
"epoch": 0.4740740740740741,
"grad_norm": 0.3749343248268511,
"kl": 0.1827392578125,
"learning_rate": 6.675948654457873e-07,
"loss": 0.0726,
"reward": 1.254590556025505,
"reward_std": 0.40981949865818024,
"rewards/accuracy_reward": 0.6696428805589676,
"rewards/cosine_scaled_reward": 0.5849476233124733,
"step": 160
},
{
"completion_length": 864.892894744873,
"epoch": 0.47703703703703704,
"grad_norm": 0.4140143134046075,
"kl": 0.224365234375,
"learning_rate": 6.6308501994126e-07,
"loss": 0.07,
"reward": 1.4429502189159393,
"reward_std": 0.4082343354821205,
"rewards/accuracy_reward": 0.7678571790456772,
"rewards/cosine_scaled_reward": 0.6750930473208427,
"step": 161
},
{
"completion_length": 1032.7545013427734,
"epoch": 0.48,
"grad_norm": 0.38927414518235437,
"kl": 0.18310546875,
"learning_rate": 6.585630177373679e-07,
"loss": 0.0317,
"reward": 1.3784838616847992,
"reward_std": 0.6301179528236389,
"rewards/accuracy_reward": 0.736607164144516,
"rewards/cosine_scaled_reward": 0.641876682639122,
"step": 162
},
{
"completion_length": 1148.0491790771484,
"epoch": 0.482962962962963,
"grad_norm": 0.37818983157554104,
"kl": 0.20947265625,
"learning_rate": 6.540293449517364e-07,
"loss": 0.1298,
"reward": 1.0715374201536179,
"reward_std": 0.560625359416008,
"rewards/accuracy_reward": 0.6026786044239998,
"rewards/cosine_scaled_reward": 0.4688587933778763,
"step": 163
},
{
"completion_length": 1020.1384429931641,
"epoch": 0.48592592592592593,
"grad_norm": 0.5055960507298748,
"kl": 0.2374267578125,
"learning_rate": 6.494844889565838e-07,
"loss": -0.0305,
"reward": 1.3220538794994354,
"reward_std": 0.5499390736222267,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.6211609840393066,
"step": 164
},
{
"completion_length": 1183.3348693847656,
"epoch": 0.4888888888888889,
"grad_norm": 0.43271458690777626,
"kl": 0.197998046875,
"learning_rate": 6.449289383263299e-07,
"loss": 0.0018,
"reward": 1.2181346118450165,
"reward_std": 0.5314186587929726,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/cosine_scaled_reward": 0.5574202537536621,
"step": 165
},
{
"completion_length": 1187.1339721679688,
"epoch": 0.4918518518518519,
"grad_norm": 0.34390574748275693,
"kl": 0.18408203125,
"learning_rate": 6.403631827850733e-07,
"loss": -0.001,
"reward": 1.299418330192566,
"reward_std": 0.4987459257245064,
"rewards/accuracy_reward": 0.7098214775323868,
"rewards/cosine_scaled_reward": 0.5895968675613403,
"step": 166
},
{
"completion_length": 1049.1652221679688,
"epoch": 0.4948148148148148,
"grad_norm": 0.43196078764273776,
"kl": 0.233154296875,
"learning_rate": 6.357877131539459e-07,
"loss": -0.0543,
"reward": 1.2178914546966553,
"reward_std": 0.6870964467525482,
"rewards/accuracy_reward": 0.651785746216774,
"rewards/cosine_scaled_reward": 0.5661056637763977,
"step": 167
},
{
"completion_length": 889.2053985595703,
"epoch": 0.49777777777777776,
"grad_norm": 0.5031433742478459,
"kl": 0.280517578125,
"learning_rate": 6.312030212983492e-07,
"loss": 0.0753,
"reward": 1.401370495557785,
"reward_std": 0.5911066308617592,
"rewards/accuracy_reward": 0.736607164144516,
"rewards/cosine_scaled_reward": 0.6647634506225586,
"step": 168
},
{
"completion_length": 947.0134506225586,
"epoch": 0.5007407407407407,
"grad_norm": 0.33802955155948805,
"kl": 0.206787109375,
"learning_rate": 6.266096000750794e-07,
"loss": -0.0208,
"reward": 1.2584110498428345,
"reward_std": 0.5075518116354942,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.5798396170139313,
"step": 169
},
{
"completion_length": 1148.357192993164,
"epoch": 0.5037037037037037,
"grad_norm": 0.3869037274705203,
"kl": 0.21875,
"learning_rate": 6.220079432793434e-07,
"loss": -0.06,
"reward": 1.2284764647483826,
"reward_std": 0.4915821775794029,
"rewards/accuracy_reward": 0.674107164144516,
"rewards/cosine_scaled_reward": 0.5543693378567696,
"step": 170
},
{
"completion_length": 1371.2634582519531,
"epoch": 0.5066666666666667,
"grad_norm": 0.3786413188449613,
"kl": 0.236572265625,
"learning_rate": 6.173985455916767e-07,
"loss": 0.0856,
"reward": 1.22401562333107,
"reward_std": 0.44081344455480576,
"rewards/accuracy_reward": 0.6696428954601288,
"rewards/cosine_scaled_reward": 0.5543726608157158,
"step": 171
},
{
"completion_length": 1635.5670166015625,
"epoch": 0.5096296296296297,
"grad_norm": 0.3542416380801021,
"kl": 0.1768798828125,
"learning_rate": 6.127819025247654e-07,
"loss": -0.021,
"reward": 0.9518559873104095,
"reward_std": 0.6247570067644119,
"rewards/accuracy_reward": 0.5446428805589676,
"rewards/cosine_scaled_reward": 0.40721310302615166,
"step": 172
},
{
"completion_length": 696.8839721679688,
"epoch": 0.5125925925925926,
"grad_norm": 0.44284735926034635,
"kl": 0.224609375,
"learning_rate": 6.081585103701769e-07,
"loss": 0.0629,
"reward": 1.3010612279176712,
"reward_std": 0.4532425180077553,
"rewards/accuracy_reward": 0.7008928954601288,
"rewards/cosine_scaled_reward": 0.6001683697104454,
"step": 173
},
{
"completion_length": 973.544677734375,
"epoch": 0.5155555555555555,
"grad_norm": 0.36960232543289,
"kl": 0.21728515625,
"learning_rate": 6.0352886614501e-07,
"loss": 0.0488,
"reward": 1.1753744930028915,
"reward_std": 0.5731424987316132,
"rewards/accuracy_reward": 0.642857164144516,
"rewards/cosine_scaled_reward": 0.5325173661112785,
"step": 174
},
{
"completion_length": 1180.0045013427734,
"epoch": 0.5185185185185185,
"grad_norm": 0.3761170899059127,
"kl": 0.235595703125,
"learning_rate": 5.988934675384635e-07,
"loss": -0.0849,
"reward": 1.1396174132823944,
"reward_std": 0.6925529539585114,
"rewards/accuracy_reward": 0.611607164144516,
"rewards/cosine_scaled_reward": 0.5280102342367172,
"step": 175
},
{
"completion_length": 518.2946701049805,
"epoch": 0.5214814814814814,
"grad_norm": 0.5269257185741734,
"kl": 0.3154296875,
"learning_rate": 5.942528128583356e-07,
"loss": 0.0309,
"reward": 1.181448057293892,
"reward_std": 0.5618361011147499,
"rewards/accuracy_reward": 0.6205357387661934,
"rewards/cosine_scaled_reward": 0.5609123036265373,
"step": 176
},
{
"completion_length": 1084.8616485595703,
"epoch": 0.5244444444444445,
"grad_norm": 0.385748212876016,
"kl": 0.22021484375,
"learning_rate": 5.896074009774554e-07,
"loss": 0.0062,
"reward": 1.270677775144577,
"reward_std": 0.5810166075825691,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5831777602434158,
"step": 177
},
{
"completion_length": 1041.5045013427734,
"epoch": 0.5274074074074074,
"grad_norm": 0.46881526757275843,
"kl": 0.24072265625,
"learning_rate": 5.849577312800529e-07,
"loss": 0.0565,
"reward": 1.2370425462722778,
"reward_std": 0.6740812063217163,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/cosine_scaled_reward": 0.5763282403349876,
"step": 178
},
{
"completion_length": 728.0402221679688,
"epoch": 0.5303703703703704,
"grad_norm": 0.470067894180888,
"kl": 0.25830078125,
"learning_rate": 5.803043036080764e-07,
"loss": 0.0428,
"reward": 1.3547908663749695,
"reward_std": 0.5294669568538666,
"rewards/accuracy_reward": 0.723214328289032,
"rewards/cosine_scaled_reward": 0.6315765678882599,
"step": 179
},
{
"completion_length": 1242.6027221679688,
"epoch": 0.5333333333333333,
"grad_norm": 0.4591428088932079,
"kl": 0.21435546875,
"learning_rate": 5.756476182074582e-07,
"loss": 0.0083,
"reward": 1.2970826625823975,
"reward_std": 0.5159892141819,
"rewards/accuracy_reward": 0.7008928954601288,
"rewards/cosine_scaled_reward": 0.5961898565292358,
"step": 180
},
{
"completion_length": 1337.1607666015625,
"epoch": 0.5362962962962963,
"grad_norm": 0.3661728892369155,
"kl": 0.197021484375,
"learning_rate": 5.709881756743379e-07,
"loss": 0.0089,
"reward": 1.1403344869613647,
"reward_std": 0.5260699540376663,
"rewards/accuracy_reward": 0.6294643059372902,
"rewards/cosine_scaled_reward": 0.5108701921999454,
"step": 181
},
{
"completion_length": 919.8214836120605,
"epoch": 0.5392592592592592,
"grad_norm": 0.3895111477578375,
"kl": 0.243408203125,
"learning_rate": 5.663264769012486e-07,
"loss": -0.068,
"reward": 1.4501311480998993,
"reward_std": 0.4405294507741928,
"rewards/accuracy_reward": 0.7678571790456772,
"rewards/cosine_scaled_reward": 0.6822739690542221,
"step": 182
},
{
"completion_length": 1210.0982818603516,
"epoch": 0.5422222222222223,
"grad_norm": 0.3130809107538841,
"kl": 0.18115234375,
"learning_rate": 5.616630230232704e-07,
"loss": 0.0747,
"reward": 1.360123723745346,
"reward_std": 0.4805947467684746,
"rewards/accuracy_reward": 0.7321428954601288,
"rewards/cosine_scaled_reward": 0.6279808431863785,
"step": 183
},
{
"completion_length": 1449.9197082519531,
"epoch": 0.5451851851851852,
"grad_norm": 0.4137201964542162,
"kl": 0.166748046875,
"learning_rate": 5.569983153641579e-07,
"loss": 0.0794,
"reward": 1.2458688914775848,
"reward_std": 0.4530060738325119,
"rewards/accuracy_reward": 0.6741071715950966,
"rewards/cosine_scaled_reward": 0.5717617124319077,
"step": 184
},
{
"completion_length": 1297.200942993164,
"epoch": 0.5481481481481482,
"grad_norm": 0.40557122894404823,
"kl": 0.19580078125,
"learning_rate": 5.523328553824479e-07,
"loss": 0.1124,
"reward": 1.176156997680664,
"reward_std": 0.5489060133695602,
"rewards/accuracy_reward": 0.6651786118745804,
"rewards/cosine_scaled_reward": 0.5109783783555031,
"step": 185
},
{
"completion_length": 1219.9420166015625,
"epoch": 0.5511111111111111,
"grad_norm": 0.4465073109866194,
"kl": 0.194091796875,
"learning_rate": 5.476671446175522e-07,
"loss": 0.0069,
"reward": 1.1364451944828033,
"reward_std": 0.6480904817581177,
"rewards/accuracy_reward": 0.6205357387661934,
"rewards/cosine_scaled_reward": 0.5159093961119652,
"step": 186
},
{
"completion_length": 861.9553833007812,
"epoch": 0.554074074074074,
"grad_norm": 0.4575321909035033,
"kl": 0.215087890625,
"learning_rate": 5.43001684635842e-07,
"loss": 0.0884,
"reward": 1.6217327415943146,
"reward_std": 0.38701897859573364,
"rewards/accuracy_reward": 0.870535746216774,
"rewards/cosine_scaled_reward": 0.7511969804763794,
"step": 187
},
{
"completion_length": 1326.415267944336,
"epoch": 0.557037037037037,
"grad_norm": 0.39815832938233886,
"kl": 0.18359375,
"learning_rate": 5.383369769767296e-07,
"loss": 0.0396,
"reward": 1.2991815507411957,
"reward_std": 0.5862655192613602,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5893600881099701,
"step": 188
},
{
"completion_length": 1835.3527221679688,
"epoch": 0.56,
"grad_norm": 0.31642380883582527,
"kl": 0.13916015625,
"learning_rate": 5.336735230987514e-07,
"loss": 0.079,
"reward": 1.0685685127973557,
"reward_std": 0.5348366796970367,
"rewards/accuracy_reward": 0.6160714477300644,
"rewards/cosine_scaled_reward": 0.45249706506729126,
"step": 189
},
{
"completion_length": 1354.2053985595703,
"epoch": 0.562962962962963,
"grad_norm": 0.37730924544753974,
"kl": 0.196533203125,
"learning_rate": 5.290118243256622e-07,
"loss": -0.0122,
"reward": 1.1858795583248138,
"reward_std": 0.40029022842645645,
"rewards/accuracy_reward": 0.6741071715950966,
"rewards/cosine_scaled_reward": 0.5117723196744919,
"step": 190
},
{
"completion_length": 1333.5045318603516,
"epoch": 0.5659259259259259,
"grad_norm": 0.3191966963841145,
"kl": 0.1488037109375,
"learning_rate": 5.243523817925418e-07,
"loss": 0.0125,
"reward": 1.1546119302511215,
"reward_std": 0.41448020190000534,
"rewards/accuracy_reward": 0.6383928954601288,
"rewards/cosine_scaled_reward": 0.5162190869450569,
"step": 191
},
{
"completion_length": 2106.4197387695312,
"epoch": 0.5688888888888889,
"grad_norm": 0.28781845531452005,
"kl": 0.11285400390625,
"learning_rate": 5.196956963919237e-07,
"loss": 0.1116,
"reward": 1.1330502927303314,
"reward_std": 0.55214674025774,
"rewards/accuracy_reward": 0.6517857313156128,
"rewards/cosine_scaled_reward": 0.4812645688652992,
"step": 192
},
{
"completion_length": 1593.9866943359375,
"epoch": 0.5718518518518518,
"grad_norm": 0.2628953319964047,
"kl": 0.1348876953125,
"learning_rate": 5.150422687199471e-07,
"loss": 0.0287,
"reward": 1.3014012575149536,
"reward_std": 0.47693130373954773,
"rewards/accuracy_reward": 0.7232143133878708,
"rewards/cosine_scaled_reward": 0.5781868472695351,
"step": 193
},
{
"completion_length": 1275.2366790771484,
"epoch": 0.5748148148148148,
"grad_norm": 0.2994379062128194,
"kl": 0.195068359375,
"learning_rate": 5.103925990225448e-07,
"loss": 0.02,
"reward": 1.2638274729251862,
"reward_std": 0.3140847235918045,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.5852559804916382,
"step": 194
},
{
"completion_length": 895.9241409301758,
"epoch": 0.5777777777777777,
"grad_norm": 0.46329426811686814,
"kl": 0.22216796875,
"learning_rate": 5.057471871416644e-07,
"loss": 0.1082,
"reward": 1.454516351222992,
"reward_std": 0.36672039702534676,
"rewards/accuracy_reward": 0.776785746216774,
"rewards/cosine_scaled_reward": 0.6777307093143463,
"step": 195
},
{
"completion_length": 2187.2054748535156,
"epoch": 0.5807407407407408,
"grad_norm": 0.2967868062982687,
"kl": 0.1002197265625,
"learning_rate": 5.011065324615364e-07,
"loss": 0.0748,
"reward": 1.204501986503601,
"reward_std": 0.5387090295553207,
"rewards/accuracy_reward": 0.7053571939468384,
"rewards/cosine_scaled_reward": 0.4991448149085045,
"step": 196
},
{
"completion_length": 1070.044677734375,
"epoch": 0.5837037037037037,
"grad_norm": 0.3698858062676789,
"kl": 0.1868896484375,
"learning_rate": 4.964711338549901e-07,
"loss": 0.0298,
"reward": 1.3357891142368317,
"reward_std": 0.4346286430954933,
"rewards/accuracy_reward": 0.7276786118745804,
"rewards/cosine_scaled_reward": 0.6081104874610901,
"step": 197
},
{
"completion_length": 1416.6027526855469,
"epoch": 0.5866666666666667,
"grad_norm": 0.30171989247018693,
"kl": 0.1600341796875,
"learning_rate": 4.918414896298229e-07,
"loss": -0.04,
"reward": 1.2265494465827942,
"reward_std": 0.45234786719083786,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.5479779690504074,
"step": 198
},
{
"completion_length": 868.9062881469727,
"epoch": 0.5896296296296296,
"grad_norm": 0.42429433131812017,
"kl": 0.213134765625,
"learning_rate": 4.872180974752347e-07,
"loss": -0.0113,
"reward": 1.5499212741851807,
"reward_std": 0.3130173161625862,
"rewards/accuracy_reward": 0.8080357611179352,
"rewards/cosine_scaled_reward": 0.7418854981660843,
"step": 199
},
{
"completion_length": 1253.1206359863281,
"epoch": 0.5925925925925926,
"grad_norm": 0.3397983148086048,
"kl": 0.179931640625,
"learning_rate": 4.826014544083234e-07,
"loss": 0.0647,
"reward": 1.3181427121162415,
"reward_std": 0.46541793644428253,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.6038569808006287,
"step": 200
},
{
"completion_length": 1317.982192993164,
"epoch": 0.5955555555555555,
"grad_norm": 0.3438713760881616,
"kl": 0.1781005859375,
"learning_rate": 4.779920567206568e-07,
"loss": -0.0592,
"reward": 1.4068642556667328,
"reward_std": 0.46310556679964066,
"rewards/accuracy_reward": 0.7544643133878708,
"rewards/cosine_scaled_reward": 0.6523999273777008,
"step": 201
},
{
"completion_length": 1139.3973846435547,
"epoch": 0.5985185185185186,
"grad_norm": 0.4728872141325827,
"kl": 0.205810546875,
"learning_rate": 4.733903999249206e-07,
"loss": 0.1675,
"reward": 1.2444301843643188,
"reward_std": 0.29916173219680786,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.5658586621284485,
"step": 202
},
{
"completion_length": 1130.0357666015625,
"epoch": 0.6014814814814815,
"grad_norm": 0.4322501072269441,
"kl": 0.182861328125,
"learning_rate": 4.687969787016507e-07,
"loss": 0.09,
"reward": 1.3736966401338577,
"reward_std": 0.38464444130659103,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.632625162601471,
"step": 203
},
{
"completion_length": 1880.1072387695312,
"epoch": 0.6044444444444445,
"grad_norm": 0.25334618537099585,
"kl": 0.105926513671875,
"learning_rate": 4.642122868460542e-07,
"loss": 0.0293,
"reward": 1.2221907079219818,
"reward_std": 0.4497520886361599,
"rewards/accuracy_reward": 0.683035746216774,
"rewards/cosine_scaled_reward": 0.5391549617052078,
"step": 204
},
{
"completion_length": 1142.4330596923828,
"epoch": 0.6074074074074074,
"grad_norm": 0.41389278300030885,
"kl": 0.2017822265625,
"learning_rate": 4.596368172149268e-07,
"loss": -0.0062,
"reward": 1.0300216674804688,
"reward_std": 0.5647179707884789,
"rewards/accuracy_reward": 0.5669643133878708,
"rewards/cosine_scaled_reward": 0.46305735409259796,
"step": 205
},
{
"completion_length": 1438.7411193847656,
"epoch": 0.6103703703703703,
"grad_norm": 0.381533587277155,
"kl": 0.18603515625,
"learning_rate": 4.550710616736702e-07,
"loss": 0.0456,
"reward": 1.358052909374237,
"reward_std": 0.3910771645605564,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.6169814988970757,
"step": 206
},
{
"completion_length": 1475.946533203125,
"epoch": 0.6133333333333333,
"grad_norm": 0.3649152348536222,
"kl": 0.186767578125,
"learning_rate": 4.505155110434162e-07,
"loss": 0.0661,
"reward": 1.3614622950553894,
"reward_std": 0.43088656663894653,
"rewards/accuracy_reward": 0.7366071790456772,
"rewards/cosine_scaled_reward": 0.624855138361454,
"step": 207
},
{
"completion_length": 1380.1295471191406,
"epoch": 0.6162962962962963,
"grad_norm": 0.36924439281566757,
"kl": 0.17333984375,
"learning_rate": 4.459706550482638e-07,
"loss": -0.0171,
"reward": 1.2316114753484726,
"reward_std": 0.5087971612811089,
"rewards/accuracy_reward": 0.6741071790456772,
"rewards/cosine_scaled_reward": 0.5575042814016342,
"step": 208
},
{
"completion_length": 1470.2590026855469,
"epoch": 0.6192592592592593,
"grad_norm": 0.3980563161448672,
"kl": 0.168212890625,
"learning_rate": 4.4143698226263207e-07,
"loss": 0.0399,
"reward": 1.2171460092067719,
"reward_std": 0.6463766992092133,
"rewards/accuracy_reward": 0.6696428805589676,
"rewards/cosine_scaled_reward": 0.5475031360983849,
"step": 209
},
{
"completion_length": 899.7411193847656,
"epoch": 0.6222222222222222,
"grad_norm": 0.4032913849721601,
"kl": 0.2008056640625,
"learning_rate": 4.3691498005874007e-07,
"loss": 0.0048,
"reward": 1.4577341675758362,
"reward_std": 0.539386659860611,
"rewards/accuracy_reward": 0.7767857611179352,
"rewards/cosine_scaled_reward": 0.6809485107660294,
"step": 210
},
{
"completion_length": 1152.5804138183594,
"epoch": 0.6251851851851852,
"grad_norm": 0.35080155719175315,
"kl": 0.173095703125,
"learning_rate": 4.324051345542128e-07,
"loss": 0.0312,
"reward": 1.2082395255565643,
"reward_std": 0.46680425107479095,
"rewards/accuracy_reward": 0.6562500298023224,
"rewards/cosine_scaled_reward": 0.5519895032048225,
"step": 211
},
{
"completion_length": 1412.0268249511719,
"epoch": 0.6281481481481481,
"grad_norm": 0.36033713983943594,
"kl": 0.186279296875,
"learning_rate": 4.2790793055982354e-07,
"loss": 0.1063,
"reward": 1.1815235912799835,
"reward_std": 0.4442535899579525,
"rewards/accuracy_reward": 0.6562500149011612,
"rewards/cosine_scaled_reward": 0.5252735912799835,
"step": 212
},
{
"completion_length": 1060.7143249511719,
"epoch": 0.6311111111111111,
"grad_norm": 0.46894353747180056,
"kl": 0.222900390625,
"learning_rate": 4.234238515273768e-07,
"loss": 0.0439,
"reward": 1.5049341022968292,
"reward_std": 0.44284530729055405,
"rewards/accuracy_reward": 0.7991071790456772,
"rewards/cosine_scaled_reward": 0.7058268785476685,
"step": 213
},
{
"completion_length": 1348.3304290771484,
"epoch": 0.6340740740740741,
"grad_norm": 0.4055925514471472,
"kl": 0.213134765625,
"learning_rate": 4.189533794977367e-07,
"loss": 0.118,
"reward": 1.252614676952362,
"reward_std": 0.442756824195385,
"rewards/accuracy_reward": 0.6875000447034836,
"rewards/cosine_scaled_reward": 0.5651145875453949,
"step": 214
},
{
"completion_length": 1562.0982513427734,
"epoch": 0.6370370370370371,
"grad_norm": 0.3036559653599386,
"kl": 0.1678466796875,
"learning_rate": 4.14496995049007e-07,
"loss": 0.0349,
"reward": 1.2328214347362518,
"reward_std": 0.5444767251610756,
"rewards/accuracy_reward": 0.6741071790456772,
"rewards/cosine_scaled_reward": 0.5587142258882523,
"step": 215
},
{
"completion_length": 1202.1116638183594,
"epoch": 0.64,
"grad_norm": 0.37347997620862683,
"kl": 0.21240234375,
"learning_rate": 4.100551772448697e-07,
"loss": 0.1321,
"reward": 1.2631460428237915,
"reward_std": 0.43043725937604904,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/cosine_scaled_reward": 0.5711817443370819,
"step": 216
},
{
"completion_length": 1593.05810546875,
"epoch": 0.642962962962963,
"grad_norm": 0.3224052157606133,
"kl": 0.183837890625,
"learning_rate": 4.056284035830846e-07,
"loss": -0.0014,
"reward": 1.0634922683238983,
"reward_std": 0.5580763593316078,
"rewards/accuracy_reward": 0.6160714626312256,
"rewards/cosine_scaled_reward": 0.44742076098918915,
"step": 217
},
{
"completion_length": 1347.2098693847656,
"epoch": 0.6459259259259259,
"grad_norm": 0.37978234862418775,
"kl": 0.19873046875,
"learning_rate": 4.012171499441578e-07,
"loss": -0.0295,
"reward": 1.2675736546516418,
"reward_std": 0.4535221755504608,
"rewards/accuracy_reward": 0.6964286118745804,
"rewards/cosine_scaled_reward": 0.5711449980735779,
"step": 218
},
{
"completion_length": 978.2991638183594,
"epoch": 0.6488888888888888,
"grad_norm": 0.3878234235322433,
"kl": 0.216796875,
"learning_rate": 3.968218905401853e-07,
"loss": 0.068,
"reward": 1.36513289809227,
"reward_std": 0.6089888289570808,
"rewards/accuracy_reward": 0.7276786267757416,
"rewards/cosine_scaled_reward": 0.6374543011188507,
"step": 219
},
{
"completion_length": 1114.9018249511719,
"epoch": 0.6518518518518519,
"grad_norm": 0.3757339213906905,
"kl": 0.216064453125,
"learning_rate": 3.924430978638742e-07,
"loss": 0.0546,
"reward": 1.0311194062232971,
"reward_std": 0.5321889817714691,
"rewards/accuracy_reward": 0.5625000298023224,
"rewards/cosine_scaled_reward": 0.46861938387155533,
"step": 220
},
{
"completion_length": 1431.6875915527344,
"epoch": 0.6548148148148148,
"grad_norm": 0.38591627549365615,
"kl": 0.21826171875,
"learning_rate": 3.8808124263774955e-07,
"loss": 0.0098,
"reward": 1.2712106704711914,
"reward_std": 0.5642440319061279,
"rewards/accuracy_reward": 0.6875000447034836,
"rewards/cosine_scaled_reward": 0.583710677921772,
"step": 221
},
{
"completion_length": 1437.6116943359375,
"epoch": 0.6577777777777778,
"grad_norm": 0.3628737795234154,
"kl": 0.210205078125,
"learning_rate": 3.8373679376355195e-07,
"loss": 0.006,
"reward": 0.9208376854658127,
"reward_std": 0.5681522116065025,
"rewards/accuracy_reward": 0.5357143059372902,
"rewards/cosine_scaled_reward": 0.3851233683526516,
"step": 222
},
{
"completion_length": 1019.7589416503906,
"epoch": 0.6607407407407407,
"grad_norm": 0.4796974552820394,
"kl": 0.23583984375,
"learning_rate": 3.794102182718294e-07,
"loss": -0.0979,
"reward": 1.2191343009471893,
"reward_std": 0.509204089641571,
"rewards/accuracy_reward": 0.651785746216774,
"rewards/cosine_scaled_reward": 0.5673485770821571,
"step": 223
},
{
"completion_length": 1058.0357360839844,
"epoch": 0.6637037037037037,
"grad_norm": 0.46320248751547094,
"kl": 0.266357421875,
"learning_rate": 3.751019812717322e-07,
"loss": -0.0134,
"reward": 1.1183428168296814,
"reward_std": 0.6392548233270645,
"rewards/accuracy_reward": 0.6071428805589676,
"rewards/cosine_scaled_reward": 0.5111999437212944,
"step": 224
},
{
"completion_length": 1300.5268249511719,
"epoch": 0.6666666666666666,
"grad_norm": 0.4371531247021142,
"kl": 0.274169921875,
"learning_rate": 3.708125459010134e-07,
"loss": -0.1225,
"reward": 1.1466220319271088,
"reward_std": 0.5585737600922585,
"rewards/accuracy_reward": 0.611607164144516,
"rewards/cosine_scaled_reward": 0.5350148379802704,
"step": 225
},
{
"completion_length": 1168.6652374267578,
"epoch": 0.6696296296296296,
"grad_norm": 0.5078015018739912,
"kl": 0.24072265625,
"learning_rate": 3.6654237327624003e-07,
"loss": 0.1142,
"reward": 1.0346617102622986,
"reward_std": 0.5203389897942543,
"rewards/accuracy_reward": 0.580357164144516,
"rewards/cosine_scaled_reward": 0.454304538667202,
"step": 226
},
{
"completion_length": 1110.995590209961,
"epoch": 0.6725925925925926,
"grad_norm": 0.4665048248523059,
"kl": 0.264404296875,
"learning_rate": 3.622919224432248e-07,
"loss": -0.074,
"reward": 1.2056891322135925,
"reward_std": 0.6480113118886948,
"rewards/accuracy_reward": 0.642857164144516,
"rewards/cosine_scaled_reward": 0.5628319680690765,
"step": 227
},
{
"completion_length": 1133.5313186645508,
"epoch": 0.6755555555555556,
"grad_norm": 0.3798907809838722,
"kl": 0.2587890625,
"learning_rate": 3.580616503276772e-07,
"loss": 0.0722,
"reward": 1.1472938358783722,
"reward_std": 0.6741429939866066,
"rewards/accuracy_reward": 0.620535746216774,
"rewards/cosine_scaled_reward": 0.5267581045627594,
"step": 228
},
{
"completion_length": 1382.5670318603516,
"epoch": 0.6785185185185185,
"grad_norm": 0.4236817109835327,
"kl": 0.204345703125,
"learning_rate": 3.5385201168608303e-07,
"loss": 0.0165,
"reward": 1.1142818331718445,
"reward_std": 0.49922633171081543,
"rewards/accuracy_reward": 0.6383928954601288,
"rewards/cosine_scaled_reward": 0.4758888818323612,
"step": 229
},
{
"completion_length": 1587.8170318603516,
"epoch": 0.6814814814814815,
"grad_norm": 0.3672380211269815,
"kl": 0.1927490234375,
"learning_rate": 3.4966345905681984e-07,
"loss": 0.0907,
"reward": 1.0480027794837952,
"reward_std": 0.713165745139122,
"rewards/accuracy_reward": 0.6116071790456772,
"rewards/cosine_scaled_reward": 0.4363955929875374,
"step": 230
},
{
"completion_length": 1256.8348846435547,
"epoch": 0.6844444444444444,
"grad_norm": 0.33221266123133913,
"kl": 0.22412109375,
"learning_rate": 3.4549644271150723e-07,
"loss": -0.0146,
"reward": 1.2618545591831207,
"reward_std": 0.4967653974890709,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5743545740842819,
"step": 231
},
{
"completion_length": 1367.9197235107422,
"epoch": 0.6874074074074074,
"grad_norm": 0.387482879229943,
"kl": 0.236572265625,
"learning_rate": 3.413514106066026e-07,
"loss": -0.0083,
"reward": 1.1527521908283234,
"reward_std": 0.5814446583390236,
"rewards/accuracy_reward": 0.6339286118745804,
"rewards/cosine_scaled_reward": 0.518823616206646,
"step": 232
},
{
"completion_length": 1276.5402221679688,
"epoch": 0.6903703703703704,
"grad_norm": 0.46231425089597195,
"kl": 0.236083984375,
"learning_rate": 3.3722880833524704e-07,
"loss": -0.0412,
"reward": 1.0933943092823029,
"reward_std": 0.6421699896454811,
"rewards/accuracy_reward": 0.611607164144516,
"rewards/cosine_scaled_reward": 0.48178714513778687,
"step": 233
},
{
"completion_length": 721.794677734375,
"epoch": 0.6933333333333334,
"grad_norm": 0.5338257753145169,
"kl": 0.31005859375,
"learning_rate": 3.3312907907936097e-07,
"loss": 0.0071,
"reward": 1.3637142181396484,
"reward_std": 0.46142444014549255,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.6494284868240356,
"step": 234
},
{
"completion_length": 1033.7678909301758,
"epoch": 0.6962962962962963,
"grad_norm": 0.43977586309359473,
"kl": 0.251220703125,
"learning_rate": 3.2905266356200506e-07,
"loss": 0.0159,
"reward": 1.279816746711731,
"reward_std": 0.5453010722994804,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/cosine_scaled_reward": 0.587852418422699,
"step": 235
},
{
"completion_length": 1518.8437805175781,
"epoch": 0.6992592592592592,
"grad_norm": 0.40240053570452927,
"kl": 0.21875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0586,
"reward": 1.1353959143161774,
"reward_std": 0.4257539436221123,
"rewards/accuracy_reward": 0.6339286118745804,
"rewards/cosine_scaled_reward": 0.5014673247933388,
"step": 236
},
{
"completion_length": 1218.2813110351562,
"epoch": 0.7022222222222222,
"grad_norm": 0.42429461890467546,
"kl": 0.242919921875,
"learning_rate": 3.2097152405681904e-07,
"loss": -0.0396,
"reward": 1.2611887753009796,
"reward_std": 0.6024208590388298,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.560295857489109,
"step": 237
},
{
"completion_length": 1438.3527221679688,
"epoch": 0.7051851851851851,
"grad_norm": 0.3284405554836145,
"kl": 0.1895751953125,
"learning_rate": 3.1696766879575354e-07,
"loss": 0.0122,
"reward": 1.0729888081550598,
"reward_std": 0.41248803213238716,
"rewards/accuracy_reward": 0.6071428954601288,
"rewards/cosine_scaled_reward": 0.4658459797501564,
"step": 238
},
{
"completion_length": 1612.4732971191406,
"epoch": 0.7081481481481482,
"grad_norm": 0.3092443961041311,
"kl": 0.189453125,
"learning_rate": 3.1298886463335857e-07,
"loss": 0.0405,
"reward": 1.1404339224100113,
"reward_std": 0.49810411036014557,
"rewards/accuracy_reward": 0.6383928805589676,
"rewards/cosine_scaled_reward": 0.5020410493016243,
"step": 239
},
{
"completion_length": 1501.8438110351562,
"epoch": 0.7111111111111111,
"grad_norm": 0.42545118282377176,
"kl": 0.20068359375,
"learning_rate": 3.090355392931827e-07,
"loss": 0.1038,
"reward": 1.1629545539617538,
"reward_std": 0.49815448373556137,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/cosine_scaled_reward": 0.5022402182221413,
"step": 240
},
{
"completion_length": 1084.200942993164,
"epoch": 0.7140740740740741,
"grad_norm": 0.5829087708630915,
"kl": 0.287109375,
"learning_rate": 3.051081177597876e-07,
"loss": 0.0288,
"reward": 1.4094779789447784,
"reward_std": 0.4218045175075531,
"rewards/accuracy_reward": 0.7633928954601288,
"rewards/cosine_scaled_reward": 0.6460850834846497,
"step": 241
},
{
"completion_length": 1364.0044860839844,
"epoch": 0.717037037037037,
"grad_norm": 0.4157551911374349,
"kl": 0.239990234375,
"learning_rate": 3.012070222330629e-07,
"loss": 0.0491,
"reward": 1.2944897413253784,
"reward_std": 0.4317842833697796,
"rewards/accuracy_reward": 0.7053571939468384,
"rewards/cosine_scaled_reward": 0.5891326069831848,
"step": 242
},
{
"completion_length": 1542.388427734375,
"epoch": 0.72,
"grad_norm": 0.5376555808419441,
"kl": 0.261962890625,
"learning_rate": 2.97332672082837e-07,
"loss": 0.0058,
"reward": 1.268993079662323,
"reward_std": 0.5607812628149986,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.5725645199418068,
"step": 243
},
{
"completion_length": 1342.5402221679688,
"epoch": 0.7229629629629629,
"grad_norm": 0.4074657442549319,
"kl": 0.228759765625,
"learning_rate": 2.934854838037978e-07,
"loss": -0.0441,
"reward": 1.339747965335846,
"reward_std": 0.49331291019916534,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.625462144613266,
"step": 244
},
{
"completion_length": 1365.7366638183594,
"epoch": 0.725925925925926,
"grad_norm": 0.40832180400823287,
"kl": 0.24951171875,
"learning_rate": 2.8966587097071683e-07,
"loss": -0.0179,
"reward": 1.3763412535190582,
"reward_std": 0.45648277550935745,
"rewards/accuracy_reward": 0.7589285969734192,
"rewards/cosine_scaled_reward": 0.617412656545639,
"step": 245
},
{
"completion_length": 876.0937957763672,
"epoch": 0.7288888888888889,
"grad_norm": 0.6321515347107881,
"kl": 0.3095703125,
"learning_rate": 2.8587424419399055e-07,
"loss": 0.0573,
"reward": 1.5959438979625702,
"reward_std": 0.44527300633490086,
"rewards/accuracy_reward": 0.8437500149011612,
"rewards/cosine_scaled_reward": 0.7521938383579254,
"step": 246
},
{
"completion_length": 1092.5714950561523,
"epoch": 0.7318518518518519,
"grad_norm": 0.4067546608657204,
"kl": 0.218017578125,
"learning_rate": 2.821110110755004e-07,
"loss": -0.0326,
"reward": 1.2758931815624237,
"reward_std": 0.5539436712861061,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5660717189311981,
"step": 247
},
{
"completion_length": 829.7857513427734,
"epoch": 0.7348148148148148,
"grad_norm": 0.5217243910881588,
"kl": 0.3056640625,
"learning_rate": 2.783765761647934e-07,
"loss": 0.0326,
"reward": 1.2323424369096756,
"reward_std": 0.5618766322731972,
"rewards/accuracy_reward": 0.6607143133878708,
"rewards/cosine_scaled_reward": 0.5716281086206436,
"step": 248
},
{
"completion_length": 1365.2009735107422,
"epoch": 0.7377777777777778,
"grad_norm": 0.4853542957910564,
"kl": 0.29052734375,
"learning_rate": 2.746713409155951e-07,
"loss": 0.023,
"reward": 1.2717522531747818,
"reward_std": 0.581157274544239,
"rewards/accuracy_reward": 0.6964286118745804,
"rewards/cosine_scaled_reward": 0.575323686003685,
"step": 249
},
{
"completion_length": 1353.825942993164,
"epoch": 0.7407407407407407,
"grad_norm": 0.5163739772137682,
"kl": 0.26995849609375,
"learning_rate": 2.709957036426512e-07,
"loss": 0.0271,
"reward": 1.3092380166053772,
"reward_std": 0.5593340247869492,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.6038808077573776,
"step": 250
},
{
"completion_length": 1508.0982971191406,
"epoch": 0.7437037037037038,
"grad_norm": 0.4173902404577724,
"kl": 0.208984375,
"learning_rate": 2.6735005947890986e-07,
"loss": 0.0223,
"reward": 1.2288760542869568,
"reward_std": 0.7006416544318199,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5413760542869568,
"step": 251
},
{
"completion_length": 1491.982177734375,
"epoch": 0.7466666666666667,
"grad_norm": 0.41265414833354097,
"kl": 0.277099609375,
"learning_rate": 2.6373480033304397e-07,
"loss": -0.0232,
"reward": 1.0731790214776993,
"reward_std": 0.5543450340628624,
"rewards/accuracy_reward": 0.5982142984867096,
"rewards/cosine_scaled_reward": 0.4749646857380867,
"step": 252
},
{
"completion_length": 1239.7545623779297,
"epoch": 0.7496296296296296,
"grad_norm": 0.4917280354457927,
"kl": 0.2900390625,
"learning_rate": 2.6015031484732103e-07,
"loss": 0.0201,
"reward": 1.260190635919571,
"reward_std": 0.5851811021566391,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5726906284689903,
"step": 253
},
{
"completion_length": 1361.5625915527344,
"epoch": 0.7525925925925926,
"grad_norm": 0.3469414349194081,
"kl": 0.2412109375,
"learning_rate": 2.565969883558236e-07,
"loss": -0.0921,
"reward": 1.24320450425148,
"reward_std": 0.47326986491680145,
"rewards/accuracy_reward": 0.6830357611179352,
"rewards/cosine_scaled_reward": 0.5601687207818031,
"step": 254
},
{
"completion_length": 1658.2188110351562,
"epoch": 0.7555555555555555,
"grad_norm": 0.3001068094750129,
"kl": 0.2127685546875,
"learning_rate": 2.5307520284302606e-07,
"loss": 0.0615,
"reward": 1.1176211386919022,
"reward_std": 0.586233526468277,
"rewards/accuracy_reward": 0.6383928805589676,
"rewards/cosine_scaled_reward": 0.47922827303409576,
"step": 255
},
{
"completion_length": 1364.2902526855469,
"epoch": 0.7585185185185185,
"grad_norm": 0.4237148024737694,
"kl": 0.27880859375,
"learning_rate": 2.495853369027309e-07,
"loss": 0.0823,
"reward": 1.1211449354887009,
"reward_std": 0.6094193160533905,
"rewards/accuracy_reward": 0.6205357313156128,
"rewards/cosine_scaled_reward": 0.5006091818213463,
"step": 256
},
{
"completion_length": 1790.7724609375,
"epoch": 0.7614814814814815,
"grad_norm": 0.259378407584805,
"kl": 0.181396484375,
"learning_rate": 2.4612776569736984e-07,
"loss": 0.0149,
"reward": 1.3074856102466583,
"reward_std": 0.68328557908535,
"rewards/accuracy_reward": 0.7276786118745804,
"rewards/cosine_scaled_reward": 0.5798069983720779,
"step": 257
},
{
"completion_length": 1608.384017944336,
"epoch": 0.7644444444444445,
"grad_norm": 0.453218761617686,
"kl": 0.230712890625,
"learning_rate": 2.4270286091767335e-07,
"loss": 0.1279,
"reward": 1.222515344619751,
"reward_std": 0.3630467727780342,
"rewards/accuracy_reward": 0.683035746216774,
"rewards/cosine_scaled_reward": 0.539479598402977,
"step": 258
},
{
"completion_length": 685.4464645385742,
"epoch": 0.7674074074074074,
"grad_norm": 0.578456215741116,
"kl": 0.3359375,
"learning_rate": 2.39310990742714e-07,
"loss": 0.1645,
"reward": 1.4390722215175629,
"reward_std": 0.5096501708030701,
"rewards/accuracy_reward": 0.7678571939468384,
"rewards/cosine_scaled_reward": 0.6712149977684021,
"step": 259
},
{
"completion_length": 1224.075942993164,
"epoch": 0.7703703703703704,
"grad_norm": 0.400946501548652,
"kl": 0.25,
"learning_rate": 2.3595251980032673e-07,
"loss": 0.0252,
"reward": 1.4241975545883179,
"reward_std": 0.5224835053086281,
"rewards/accuracy_reward": 0.767857164144516,
"rewards/cosine_scaled_reward": 0.6563403755426407,
"step": 260
},
{
"completion_length": 1514.6875610351562,
"epoch": 0.7733333333333333,
"grad_norm": 0.32107696314591083,
"kl": 0.21142578125,
"learning_rate": 2.3262780912791183e-07,
"loss": -0.0844,
"reward": 1.2489876449108124,
"reward_std": 0.47115904837846756,
"rewards/accuracy_reward": 0.683035746216774,
"rewards/cosine_scaled_reward": 0.5659519508481026,
"step": 261
},
{
"completion_length": 1049.1741638183594,
"epoch": 0.7762962962962963,
"grad_norm": 0.4983381962623269,
"kl": 0.283203125,
"learning_rate": 2.2933721613362188e-07,
"loss": -0.0656,
"reward": 1.3514663726091385,
"reward_std": 0.5631691515445709,
"rewards/accuracy_reward": 0.723214328289032,
"rewards/cosine_scaled_reward": 0.6282520294189453,
"step": 262
},
{
"completion_length": 1280.808090209961,
"epoch": 0.7792592592592592,
"grad_norm": 0.5370494902972633,
"kl": 0.259521484375,
"learning_rate": 2.2608109455794197e-07,
"loss": 0.0951,
"reward": 1.3311371505260468,
"reward_std": 0.3910303898155689,
"rewards/accuracy_reward": 0.7232143133878708,
"rewards/cosine_scaled_reward": 0.6079228222370148,
"step": 263
},
{
"completion_length": 1652.7813262939453,
"epoch": 0.7822222222222223,
"grad_norm": 0.5517506308898386,
"kl": 0.242431640625,
"learning_rate": 2.2285979443566093e-07,
"loss": -0.04,
"reward": 1.1543093919754028,
"reward_std": 0.543508306145668,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5203807801008224,
"step": 264
},
{
"completion_length": 1420.4197235107422,
"epoch": 0.7851851851851852,
"grad_norm": 0.44231688430689814,
"kl": 0.17041015625,
"learning_rate": 2.196736620582429e-07,
"loss": -0.0778,
"reward": 1.1269243955612183,
"reward_std": 0.5865212008357048,
"rewards/accuracy_reward": 0.6339286118745804,
"rewards/cosine_scaled_reward": 0.49299580603837967,
"step": 265
},
{
"completion_length": 1418.6116333007812,
"epoch": 0.7881481481481482,
"grad_norm": 0.4078519749270961,
"kl": 0.208984375,
"learning_rate": 2.1652303993660146e-07,
"loss": -0.0418,
"reward": 1.27777498960495,
"reward_std": 0.5090883374214172,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.5724178552627563,
"step": 266
},
{
"completion_length": 1807.9688720703125,
"epoch": 0.7911111111111111,
"grad_norm": 0.3612322112017449,
"kl": 0.204345703125,
"learning_rate": 2.1340826676427826e-07,
"loss": 0.0524,
"reward": 0.9595437347888947,
"reward_std": 0.5931633710861206,
"rewards/accuracy_reward": 0.5714285969734192,
"rewards/cosine_scaled_reward": 0.38811516016721725,
"step": 267
},
{
"completion_length": 1627.9599304199219,
"epoch": 0.794074074074074,
"grad_norm": 0.46615314939080527,
"kl": 0.2088623046875,
"learning_rate": 2.103296773810344e-07,
"loss": -0.0877,
"reward": 1.2892495691776276,
"reward_std": 0.5664657056331635,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5794281512498856,
"step": 268
},
{
"completion_length": 999.0357666015625,
"epoch": 0.797037037037037,
"grad_norm": 0.4273528196130989,
"kl": 0.245361328125,
"learning_rate": 2.0728760273685435e-07,
"loss": -0.0457,
"reward": 1.2847952246665955,
"reward_std": 0.5983417630195618,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/cosine_scaled_reward": 0.5928309559822083,
"step": 269
},
{
"completion_length": 1342.5179290771484,
"epoch": 0.8,
"grad_norm": 0.5157694271512652,
"kl": 0.23779296875,
"learning_rate": 2.0428236985636878e-07,
"loss": -0.0943,
"reward": 1.285570204257965,
"reward_std": 0.6152675747871399,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.5846773013472557,
"step": 270
},
{
"completion_length": 1232.8393859863281,
"epoch": 0.802962962962963,
"grad_norm": 0.4444373741140913,
"kl": 0.234375,
"learning_rate": 2.0131430180369957e-07,
"loss": -0.0321,
"reward": 1.4194039404392242,
"reward_std": 0.510127916932106,
"rewards/accuracy_reward": 0.7633928805589676,
"rewards/cosine_scaled_reward": 0.6560111045837402,
"step": 271
},
{
"completion_length": 1367.5134582519531,
"epoch": 0.8059259259259259,
"grad_norm": 0.4184360736447391,
"kl": 0.247802734375,
"learning_rate": 1.9838371764772992e-07,
"loss": 0.0103,
"reward": 1.1604254990816116,
"reward_std": 0.6053372994065285,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5264968723058701,
"step": 272
},
{
"completion_length": 1476.8304290771484,
"epoch": 0.8088888888888889,
"grad_norm": 0.5070722115121915,
"kl": 0.25146484375,
"learning_rate": 1.954909324278041e-07,
"loss": 0.0343,
"reward": 1.1654971539974213,
"reward_std": 0.5020733252167702,
"rewards/accuracy_reward": 0.651785746216774,
"rewards/cosine_scaled_reward": 0.5137114599347115,
"step": 273
},
{
"completion_length": 1203.1920166015625,
"epoch": 0.8118518518518518,
"grad_norm": 0.4609308538909395,
"kl": 0.251220703125,
"learning_rate": 1.9263625711986092e-07,
"loss": 0.0711,
"reward": 1.279031217098236,
"reward_std": 0.5459231436252594,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.5826026350259781,
"step": 274
},
{
"completion_length": 1180.1205749511719,
"epoch": 0.8148148148148148,
"grad_norm": 1.5003547281483087,
"kl": 0.27490234375,
"learning_rate": 1.8981999860300385e-07,
"loss": 0.1132,
"reward": 1.3819984197616577,
"reward_std": 0.5490370243787766,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.6319983601570129,
"step": 275
},
{
"completion_length": 1468.6295471191406,
"epoch": 0.8177777777777778,
"grad_norm": 0.3926677811915502,
"kl": 0.24267578125,
"learning_rate": 1.8704245962651026e-07,
"loss": 0.049,
"reward": 1.3316981196403503,
"reward_std": 0.5511728748679161,
"rewards/accuracy_reward": 0.7232142984867096,
"rewards/cosine_scaled_reward": 0.608483761548996,
"step": 276
},
{
"completion_length": 1194.0089721679688,
"epoch": 0.8207407407407408,
"grad_norm": 0.5852846616386558,
"kl": 0.3037109375,
"learning_rate": 1.8430393877728745e-07,
"loss": -0.079,
"reward": 1.3836183547973633,
"reward_std": 0.5606663823127747,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.6336182802915573,
"step": 277
},
{
"completion_length": 1510.401870727539,
"epoch": 0.8237037037037037,
"grad_norm": 0.3342562243587002,
"kl": 0.197509765625,
"learning_rate": 1.8160473044777263e-07,
"loss": 0.1086,
"reward": 1.1657965332269669,
"reward_std": 0.5177839547395706,
"rewards/accuracy_reward": 0.651785746216774,
"rewards/cosine_scaled_reward": 0.5140108019113541,
"step": 278
},
{
"completion_length": 1184.7054138183594,
"epoch": 0.8266666666666667,
"grad_norm": 0.45102568845817925,
"kl": 0.261474609375,
"learning_rate": 1.789451248042867e-07,
"loss": -0.0701,
"reward": 1.5106743574142456,
"reward_std": 0.45990853384137154,
"rewards/accuracy_reward": 0.8035714626312256,
"rewards/cosine_scaled_reward": 0.7071028649806976,
"step": 279
},
{
"completion_length": 1492.3527526855469,
"epoch": 0.8296296296296296,
"grad_norm": 0.37307331027960394,
"kl": 0.17333984375,
"learning_rate": 1.763254077558411e-07,
"loss": 0.0009,
"reward": 1.390456646680832,
"reward_std": 0.5531467348337173,
"rewards/accuracy_reward": 0.7589286118745804,
"rewards/cosine_scaled_reward": 0.6315280720591545,
"step": 280
},
{
"completion_length": 773.513427734375,
"epoch": 0.8325925925925926,
"grad_norm": 0.5361053249805026,
"kl": 0.29345703125,
"learning_rate": 1.7374586092340194e-07,
"loss": 0.0238,
"reward": 1.5248645544052124,
"reward_std": 0.502290703356266,
"rewards/accuracy_reward": 0.8080357760190964,
"rewards/cosine_scaled_reward": 0.7168288230895996,
"step": 281
},
{
"completion_length": 1233.0045318603516,
"epoch": 0.8355555555555556,
"grad_norm": 0.4173056786421646,
"kl": 0.243408203125,
"learning_rate": 1.712067616096159e-07,
"loss": -0.037,
"reward": 1.3660497963428497,
"reward_std": 0.6344530582427979,
"rewards/accuracy_reward": 0.7410714626312256,
"rewards/cosine_scaled_reward": 0.6249783635139465,
"step": 282
},
{
"completion_length": 1620.200927734375,
"epoch": 0.8385185185185186,
"grad_norm": 0.39016732045765984,
"kl": 0.1807861328125,
"learning_rate": 1.6870838276900018e-07,
"loss": 0.0001,
"reward": 1.1307050585746765,
"reward_std": 0.5222566425800323,
"rewards/accuracy_reward": 0.6428571790456772,
"rewards/cosine_scaled_reward": 0.4878478869795799,
"step": 283
},
{
"completion_length": 1542.8348693847656,
"epoch": 0.8414814814814815,
"grad_norm": 0.4725307883078238,
"kl": 0.22265625,
"learning_rate": 1.6625099297859945e-07,
"loss": 0.0879,
"reward": 1.2903397679328918,
"reward_std": 0.5241145640611649,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.5760539919137955,
"step": 284
},
{
"completion_length": 1025.8527221679688,
"epoch": 0.8444444444444444,
"grad_norm": 0.5196454376003679,
"kl": 0.26513671875,
"learning_rate": 1.638348564091142e-07,
"loss": 0.1132,
"reward": 1.255773812532425,
"reward_std": 0.6003080010414124,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.577202320098877,
"step": 285
},
{
"completion_length": 1298.3348846435547,
"epoch": 0.8474074074074074,
"grad_norm": 0.5413412582194498,
"kl": 0.248291015625,
"learning_rate": 1.6146023279650146e-07,
"loss": -0.0199,
"reward": 1.0494957864284515,
"reward_std": 0.5134128257632256,
"rewards/accuracy_reward": 0.5758928880095482,
"rewards/cosine_scaled_reward": 0.47360285371541977,
"step": 286
},
{
"completion_length": 1183.8795013427734,
"epoch": 0.8503703703703703,
"grad_norm": 0.43187548636698553,
"kl": 0.238037109375,
"learning_rate": 1.5912737741405364e-07,
"loss": 0.0592,
"reward": 1.2715223133563995,
"reward_std": 0.628858245909214,
"rewards/accuracy_reward": 0.6830357313156128,
"rewards/cosine_scaled_reward": 0.5884865522384644,
"step": 287
},
{
"completion_length": 1753.2188415527344,
"epoch": 0.8533333333333334,
"grad_norm": 0.4428533634005759,
"kl": 0.190185546875,
"learning_rate": 1.5683654104495627e-07,
"loss": 0.0715,
"reward": 1.211821123957634,
"reward_std": 0.4482051581144333,
"rewards/accuracy_reward": 0.6696428954601288,
"rewards/cosine_scaled_reward": 0.5421782657504082,
"step": 288
},
{
"completion_length": 1441.8572082519531,
"epoch": 0.8562962962962963,
"grad_norm": 0.4890440610981784,
"kl": 0.2373046875,
"learning_rate": 1.5458796995532915e-07,
"loss": 0.0065,
"reward": 1.5017207860946655,
"reward_std": 0.47509852796792984,
"rewards/accuracy_reward": 0.816964328289032,
"rewards/cosine_scaled_reward": 0.6847565025091171,
"step": 289
},
{
"completion_length": 1205.3304290771484,
"epoch": 0.8592592592592593,
"grad_norm": 0.47871337934810404,
"kl": 0.24365234375,
"learning_rate": 1.5238190586775145e-07,
"loss": 0.0997,
"reward": 1.318919599056244,
"reward_std": 0.5144237354397774,
"rewards/accuracy_reward": 0.7232143133878708,
"rewards/cosine_scaled_reward": 0.5957053601741791,
"step": 290
},
{
"completion_length": 1630.5804138183594,
"epoch": 0.8622222222222222,
"grad_norm": 0.32041894818924843,
"kl": 0.1953125,
"learning_rate": 1.50218585935278e-07,
"loss": -0.038,
"reward": 1.3746657818555832,
"reward_std": 0.4751938730478287,
"rewards/accuracy_reward": 0.754464328289032,
"rewards/cosine_scaled_reward": 0.6202014237642288,
"step": 291
},
{
"completion_length": 930.7277297973633,
"epoch": 0.8651851851851852,
"grad_norm": 0.474044463723305,
"kl": 0.260986328125,
"learning_rate": 1.4809824271594384e-07,
"loss": 0.0273,
"reward": 1.3926972150802612,
"reward_std": 0.4362456612288952,
"rewards/accuracy_reward": 0.7500000298023224,
"rewards/cosine_scaled_reward": 0.6426971927285194,
"step": 292
},
{
"completion_length": 1605.7902221679688,
"epoch": 0.8681481481481481,
"grad_norm": 0.3263847223884831,
"kl": 0.175048828125,
"learning_rate": 1.4602110414776475e-07,
"loss": -0.0205,
"reward": 1.2047614008188248,
"reward_std": 0.554048590362072,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/cosine_scaled_reward": 0.5083328485488892,
"step": 293
},
{
"completion_length": 1610.9375915527344,
"epoch": 0.8711111111111111,
"grad_norm": 0.3401666470383707,
"kl": 0.1712646484375,
"learning_rate": 1.4398739352423406e-07,
"loss": 0.0331,
"reward": 1.011045515537262,
"reward_std": 0.5167748332023621,
"rewards/accuracy_reward": 0.5937500149011612,
"rewards/cosine_scaled_reward": 0.41729553043842316,
"step": 294
},
{
"completion_length": 1926.4420471191406,
"epoch": 0.8740740740740741,
"grad_norm": 0.45709540487996353,
"kl": 0.1748046875,
"learning_rate": 1.419973294703174e-07,
"loss": 0.0703,
"reward": 1.0281963050365448,
"reward_std": 0.47293490171432495,
"rewards/accuracy_reward": 0.6026785895228386,
"rewards/cosine_scaled_reward": 0.42551764845848083,
"step": 295
},
{
"completion_length": 1782.7411499023438,
"epoch": 0.8770370370370371,
"grad_norm": 0.33218294380687946,
"kl": 0.2039794921875,
"learning_rate": 1.400511259189518e-07,
"loss": 0.0501,
"reward": 1.1762474179267883,
"reward_std": 0.37930237501859665,
"rewards/accuracy_reward": 0.6696428954601288,
"rewards/cosine_scaled_reward": 0.5066045522689819,
"step": 296
},
{
"completion_length": 1053.7679023742676,
"epoch": 0.88,
"grad_norm": 0.41588133172026326,
"kl": 0.24658203125,
"learning_rate": 1.3814899208804677e-07,
"loss": 0.0503,
"reward": 1.2276224493980408,
"reward_std": 0.5703399553894997,
"rewards/accuracy_reward": 0.6785714477300644,
"rewards/cosine_scaled_reward": 0.5490510165691376,
"step": 297
},
{
"completion_length": 1754.9733276367188,
"epoch": 0.882962962962963,
"grad_norm": 0.4886881244904496,
"kl": 0.2197265625,
"learning_rate": 1.3629113245799361e-07,
"loss": -0.0222,
"reward": 1.1690296977758408,
"reward_std": 0.5029887109994888,
"rewards/accuracy_reward": 0.6562500298023224,
"rewards/cosine_scaled_reward": 0.5127796456217766,
"step": 298
},
{
"completion_length": 1098.4911346435547,
"epoch": 0.8859259259259259,
"grad_norm": 0.4020920520908739,
"kl": 0.2568359375,
"learning_rate": 1.3447774674968387e-07,
"loss": -0.0071,
"reward": 1.3663478195667267,
"reward_std": 0.6112166717648506,
"rewards/accuracy_reward": 0.7276786118745804,
"rewards/cosine_scaled_reward": 0.6386693120002747,
"step": 299
},
{
"completion_length": 1364.7634887695312,
"epoch": 0.8888888888888888,
"grad_norm": 0.6605478512720968,
"kl": 0.256103515625,
"learning_rate": 1.3270902990303869e-07,
"loss": 0.1078,
"reward": 1.3349690437316895,
"reward_std": 0.43663863837718964,
"rewards/accuracy_reward": 0.7321428954601288,
"rewards/cosine_scaled_reward": 0.6028260812163353,
"step": 300
},
{
"completion_length": 989.0045013427734,
"epoch": 0.8918518518518519,
"grad_norm": 0.5497470505531581,
"kl": 0.297119140625,
"learning_rate": 1.3098517205605325e-07,
"loss": -0.0579,
"reward": 1.1443769484758377,
"reward_std": 0.5397379323840141,
"rewards/accuracy_reward": 0.6250000149011612,
"rewards/cosine_scaled_reward": 0.5193769186735153,
"step": 301
},
{
"completion_length": 1223.0357666015625,
"epoch": 0.8948148148148148,
"grad_norm": 0.44189728480389967,
"kl": 0.241943359375,
"learning_rate": 1.2930635852435634e-07,
"loss": -0.0605,
"reward": 1.2372848689556122,
"reward_std": 0.6824733465909958,
"rewards/accuracy_reward": 0.674107164144516,
"rewards/cosine_scaled_reward": 0.563177689909935,
"step": 302
},
{
"completion_length": 1622.5179138183594,
"epoch": 0.8977777777777778,
"grad_norm": 0.3848582793128278,
"kl": 0.2061767578125,
"learning_rate": 1.276727697812894e-07,
"loss": 0.0914,
"reward": 1.2423473447561264,
"reward_std": 0.647514745593071,
"rewards/accuracy_reward": 0.6741071790456772,
"rewards/cosine_scaled_reward": 0.5682401582598686,
"step": 303
},
{
"completion_length": 1614.9286193847656,
"epoch": 0.9007407407407407,
"grad_norm": 0.3007297849667304,
"kl": 0.209716796875,
"learning_rate": 1.2608458143850493e-07,
"loss": 0.0022,
"reward": 1.2776865363121033,
"reward_std": 0.6378434896469116,
"rewards/accuracy_reward": 0.7008928805589676,
"rewards/cosine_scaled_reward": 0.5767936706542969,
"step": 304
},
{
"completion_length": 1398.571517944336,
"epoch": 0.9037037037037037,
"grad_norm": 0.4951854800661223,
"kl": 0.2578125,
"learning_rate": 1.2454196422708843e-07,
"loss": 0.0546,
"reward": 0.9712510854005814,
"reward_std": 0.5841480642557144,
"rewards/accuracy_reward": 0.5625000223517418,
"rewards/cosine_scaled_reward": 0.40875105932354927,
"step": 305
},
{
"completion_length": 1390.5759353637695,
"epoch": 0.9066666666666666,
"grad_norm": 0.5060958415409001,
"kl": 0.2391357421875,
"learning_rate": 1.2304508397920499e-07,
"loss": 0.0294,
"reward": 1.0037438869476318,
"reward_std": 0.6964580416679382,
"rewards/accuracy_reward": 0.549107164144516,
"rewards/cosine_scaled_reward": 0.45463668555021286,
"step": 306
},
{
"completion_length": 1332.0715103149414,
"epoch": 0.9096296296296297,
"grad_norm": 0.40475083679377444,
"kl": 0.251708984375,
"learning_rate": 1.2159410161027153e-07,
"loss": 0.0844,
"reward": 1.3975183367729187,
"reward_std": 0.5006646141409874,
"rewards/accuracy_reward": 0.754464328289032,
"rewards/cosine_scaled_reward": 0.6430540382862091,
"step": 307
},
{
"completion_length": 1306.8259582519531,
"epoch": 0.9125925925925926,
"grad_norm": 0.33395393560033776,
"kl": 0.218505859375,
"learning_rate": 1.2018917310165926e-07,
"loss": 0.016,
"reward": 1.2559349834918976,
"reward_std": 0.5287806503474712,
"rewards/accuracy_reward": 0.7008928954601288,
"rewards/cosine_scaled_reward": 0.5550421252846718,
"step": 308
},
{
"completion_length": 1175.1116333007812,
"epoch": 0.9155555555555556,
"grad_norm": 0.45684265652865813,
"kl": 0.2197265625,
"learning_rate": 1.1883044948392453e-07,
"loss": 0.1229,
"reward": 1.2261989116668701,
"reward_std": 0.44794493168592453,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/cosine_scaled_reward": 0.5386988818645477,
"step": 309
},
{
"completion_length": 1192.4420471191406,
"epoch": 0.9185185185185185,
"grad_norm": 0.5819087268590017,
"kl": 0.321533203125,
"learning_rate": 1.1751807682057396e-07,
"loss": -0.0148,
"reward": 1.4763469398021698,
"reward_std": 0.473043292760849,
"rewards/accuracy_reward": 0.7812500447034836,
"rewards/cosine_scaled_reward": 0.6950969099998474,
"step": 310
},
{
"completion_length": 1254.6205978393555,
"epoch": 0.9214814814814815,
"grad_norm": 0.5149829152884512,
"kl": 0.271240234375,
"learning_rate": 1.1625219619236196e-07,
"loss": -0.0156,
"reward": 1.2878541946411133,
"reward_std": 0.5938218757510185,
"rewards/accuracy_reward": 0.7053571790456772,
"rewards/cosine_scaled_reward": 0.5824970304965973,
"step": 311
},
{
"completion_length": 1246.2545013427734,
"epoch": 0.9244444444444444,
"grad_norm": 0.5342345338890936,
"kl": 0.28076171875,
"learning_rate": 1.1503294368212441e-07,
"loss": -0.0491,
"reward": 1.23801089823246,
"reward_std": 0.48125500977039337,
"rewards/accuracy_reward": 0.6651786118745804,
"rewards/cosine_scaled_reward": 0.5728322416543961,
"step": 312
},
{
"completion_length": 1623.9777221679688,
"epoch": 0.9274074074074075,
"grad_norm": 0.292124358723905,
"kl": 0.16351318359375,
"learning_rate": 1.1386045036015024e-07,
"loss": -0.011,
"reward": 1.1545456051826477,
"reward_std": 0.5948602706193924,
"rewards/accuracy_reward": 0.6607143133878708,
"rewards/cosine_scaled_reward": 0.4938312843441963,
"step": 313
},
{
"completion_length": 1209.8080596923828,
"epoch": 0.9303703703703704,
"grad_norm": 0.889231001318171,
"kl": 0.288818359375,
"learning_rate": 1.1273484227009072e-07,
"loss": 0.0213,
"reward": 1.2507951855659485,
"reward_std": 0.6353217959403992,
"rewards/accuracy_reward": 0.6785714477300644,
"rewards/cosine_scaled_reward": 0.5722237303853035,
"step": 314
},
{
"completion_length": 1585.1250915527344,
"epoch": 0.9333333333333333,
"grad_norm": 0.4536490161605513,
"kl": 0.222412109375,
"learning_rate": 1.116562404154099e-07,
"loss": 0.0189,
"reward": 1.1351844668388367,
"reward_std": 0.535600557923317,
"rewards/accuracy_reward": 0.6383928656578064,
"rewards/cosine_scaled_reward": 0.4967915639281273,
"step": 315
},
{
"completion_length": 871.2946929931641,
"epoch": 0.9362962962962963,
"grad_norm": 0.596048822760235,
"kl": 0.28515625,
"learning_rate": 1.1062476074637685e-07,
"loss": 0.1252,
"reward": 1.359487771987915,
"reward_std": 0.4618111401796341,
"rewards/accuracy_reward": 0.7187500447034836,
"rewards/cosine_scaled_reward": 0.6407377123832703,
"step": 316
},
{
"completion_length": 1378.9553985595703,
"epoch": 0.9392592592592592,
"grad_norm": 0.4592747700126037,
"kl": 0.260986328125,
"learning_rate": 1.0964051414760065e-07,
"loss": 0.0193,
"reward": 1.323800265789032,
"reward_std": 0.47177664190530777,
"rewards/accuracy_reward": 0.7187500447034836,
"rewards/cosine_scaled_reward": 0.6050502583384514,
"step": 317
},
{
"completion_length": 858.9018096923828,
"epoch": 0.9422222222222222,
"grad_norm": 0.6123065766410447,
"kl": 0.33349609375,
"learning_rate": 1.087036064261106e-07,
"loss": 0.0145,
"reward": 1.4594223201274872,
"reward_std": 0.5125085860490799,
"rewards/accuracy_reward": 0.7633928954601288,
"rewards/cosine_scaled_reward": 0.696029394865036,
"step": 318
},
{
"completion_length": 1336.3348846435547,
"epoch": 0.9451851851851852,
"grad_norm": 0.39416387811795184,
"kl": 0.247314453125,
"learning_rate": 1.0781413829998135e-07,
"loss": -0.0303,
"reward": 1.2021480649709702,
"reward_std": 0.5442958772182465,
"rewards/accuracy_reward": 0.6696428805589676,
"rewards/cosine_scaled_reward": 0.5325051471590996,
"step": 319
},
{
"completion_length": 1438.3973999023438,
"epoch": 0.9481481481481482,
"grad_norm": 0.43467725744059227,
"kl": 0.255859375,
"learning_rate": 1.0697220538750631e-07,
"loss": 0.0463,
"reward": 1.1738777160644531,
"reward_std": 0.6729736477136612,
"rewards/accuracy_reward": 0.6428571790456772,
"rewards/cosine_scaled_reward": 0.5310205593705177,
"step": 320
},
{
"completion_length": 1484.1072387695312,
"epoch": 0.9511111111111111,
"grad_norm": 0.45452453298221474,
"kl": 0.2275390625,
"learning_rate": 1.0617789819691819e-07,
"loss": -0.0514,
"reward": 1.105325609445572,
"reward_std": 0.6967962235212326,
"rewards/accuracy_reward": 0.6205357387661934,
"rewards/cosine_scaled_reward": 0.48478981852531433,
"step": 321
},
{
"completion_length": 1666.290283203125,
"epoch": 0.9540740740740741,
"grad_norm": 0.39839803599498697,
"kl": 0.240966796875,
"learning_rate": 1.054313021166595e-07,
"loss": 0.0017,
"reward": 1.1861306875944138,
"reward_std": 0.6368418782949448,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/cosine_scaled_reward": 0.5254162922501564,
"step": 322
},
{
"completion_length": 1225.4197006225586,
"epoch": 0.957037037037037,
"grad_norm": 0.5218036130830189,
"kl": 0.220703125,
"learning_rate": 1.0473249740620304e-07,
"loss": -0.0204,
"reward": 1.1594546139240265,
"reward_std": 0.6208428591489792,
"rewards/accuracy_reward": 0.6339285969734192,
"rewards/cosine_scaled_reward": 0.5255259871482849,
"step": 323
},
{
"completion_length": 1377.6339721679688,
"epoch": 0.96,
"grad_norm": 0.39857455474607256,
"kl": 0.248291015625,
"learning_rate": 1.0408155918742432e-07,
"loss": -0.0069,
"reward": 1.1889010518789291,
"reward_std": 0.5303689762949944,
"rewards/accuracy_reward": 0.6651786118745804,
"rewards/cosine_scaled_reward": 0.5237224623560905,
"step": 324
},
{
"completion_length": 1925.6072387695312,
"epoch": 0.9629629629629629,
"grad_norm": 0.4234383833079227,
"kl": 0.2197265625,
"learning_rate": 1.034785574365256e-07,
"loss": 0.0489,
"reward": 1.1286714375019073,
"reward_std": 0.564603678882122,
"rewards/accuracy_reward": 0.6294643133878708,
"rewards/cosine_scaled_reward": 0.4992070645093918,
"step": 325
},
{
"completion_length": 1323.1027221679688,
"epoch": 0.965925925925926,
"grad_norm": 0.4937948952732904,
"kl": 0.270263671875,
"learning_rate": 1.0292355697651348e-07,
"loss": 0.0144,
"reward": 0.9994739443063736,
"reward_std": 0.641077071428299,
"rewards/accuracy_reward": 0.5758928805589676,
"rewards/cosine_scaled_reward": 0.42358100414276123,
"step": 326
},
{
"completion_length": 910.9375457763672,
"epoch": 0.9688888888888889,
"grad_norm": 0.5236547427670164,
"kl": 0.32568359375,
"learning_rate": 1.0241661747023064e-07,
"loss": 0.0655,
"reward": 1.1969931423664093,
"reward_std": 0.5603185072541237,
"rewards/accuracy_reward": 0.6473214477300644,
"rewards/cosine_scaled_reward": 0.5496717244386673,
"step": 327
},
{
"completion_length": 1499.5313110351562,
"epoch": 0.9718518518518519,
"grad_norm": 0.5317429397319068,
"kl": 0.257568359375,
"learning_rate": 1.0195779341394164e-07,
"loss": 0.129,
"reward": 1.354402244091034,
"reward_std": 0.4894205704331398,
"rewards/accuracy_reward": 0.7321428805589676,
"rewards/cosine_scaled_reward": 0.6222593784332275,
"step": 328
},
{
"completion_length": 1111.9330749511719,
"epoch": 0.9748148148148148,
"grad_norm": 0.5147699965476735,
"kl": 0.260498046875,
"learning_rate": 1.0154713413147486e-07,
"loss": 0.078,
"reward": 1.2603758871555328,
"reward_std": 0.47384266555309296,
"rewards/accuracy_reward": 0.7098214626312256,
"rewards/cosine_scaled_reward": 0.5505543872714043,
"step": 329
},
{
"completion_length": 1475.4911346435547,
"epoch": 0.9777777777777777,
"grad_norm": 0.42585894642650685,
"kl": 0.23876953125,
"learning_rate": 1.0118468376892005e-07,
"loss": 0.0107,
"reward": 1.1338723003864288,
"reward_std": 0.4878518432378769,
"rewards/accuracy_reward": 0.6250000298023224,
"rewards/cosine_scaled_reward": 0.5088722705841064,
"step": 330
},
{
"completion_length": 1144.6652374267578,
"epoch": 0.9807407407407407,
"grad_norm": 0.5024574257196305,
"kl": 0.282470703125,
"learning_rate": 1.0087048128988256e-07,
"loss": 0.0042,
"reward": 1.372282713651657,
"reward_std": 0.4418197050690651,
"rewards/accuracy_reward": 0.7410714477300644,
"rewards/cosine_scaled_reward": 0.6312113404273987,
"step": 331
},
{
"completion_length": 1154.2723846435547,
"epoch": 0.9837037037037037,
"grad_norm": 0.5099595034924427,
"kl": 0.30908203125,
"learning_rate": 1.0060456047129485e-07,
"loss": 0.0052,
"reward": 1.4354938864707947,
"reward_std": 0.5711122378706932,
"rewards/accuracy_reward": 0.7633928954601288,
"rewards/cosine_scaled_reward": 0.6721010059118271,
"step": 332
},
{
"completion_length": 1347.49560546875,
"epoch": 0.9866666666666667,
"grad_norm": 0.41094105741848364,
"kl": 0.214111328125,
"learning_rate": 1.0038694989978531e-07,
"loss": -0.0321,
"reward": 1.2599745690822601,
"reward_std": 0.5518456846475601,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/cosine_scaled_reward": 0.5680102482438087,
"step": 333
},
{
"completion_length": 1579.7054138183594,
"epoch": 0.9896296296296296,
"grad_norm": 0.3982623902115583,
"kl": 0.1856689453125,
"learning_rate": 1.0021767296860537e-07,
"loss": 0.1626,
"reward": 1.2055756747722626,
"reward_std": 0.5490602627396584,
"rewards/accuracy_reward": 0.6696428805589676,
"rewards/cosine_scaled_reward": 0.535932794213295,
"step": 334
},
{
"completion_length": 1537.3438110351562,
"epoch": 0.9925925925925926,
"grad_norm": 0.43671825802827985,
"kl": 0.238037109375,
"learning_rate": 1.0009674787511447e-07,
"loss": -0.0007,
"reward": 1.2209820598363876,
"reward_std": 0.5103197321295738,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/cosine_scaled_reward": 0.542410634458065,
"step": 335
},
{
"completion_length": 1196.111686706543,
"epoch": 0.9955555555555555,
"grad_norm": 0.6107691362588656,
"kl": 0.291259765625,
"learning_rate": 1.0002418761882409e-07,
"loss": 0.0202,
"reward": 1.3667995631694794,
"reward_std": 0.44663529843091965,
"rewards/accuracy_reward": 0.7366071790456772,
"rewards/cosine_scaled_reward": 0.6301924362778664,
"step": 336
},
{
"completion_length": 1011.8393211364746,
"epoch": 0.9985185185185185,
"grad_norm": 0.7265395322809004,
"kl": 0.2783203125,
"learning_rate": 1e-07,
"loss": 0.1413,
"reward": 1.3486962914466858,
"reward_std": 0.43993912637233734,
"rewards/accuracy_reward": 0.7276785969734192,
"rewards/cosine_scaled_reward": 0.621017687022686,
"step": 337
},
{
"epoch": 0.9985185185185185,
"step": 337,
"total_flos": 0.0,
"train_loss": 0.028963628810559583,
"train_runtime": 59585.1579,
"train_samples_per_second": 0.181,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 337,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}