Qwen2.5-1.5B-Open-R1-Code-GRPO / trainer_state.json
CM's picture
Model save
fa33b89 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.8062015503875966,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 486.7500228881836,
"epoch": 0.015503875968992248,
"grad_norm": 0.26961565017700195,
"kl": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0045,
"reward": 0.0035714288242161274,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0357142873108387,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 466.6964416503906,
"epoch": 0.031007751937984496,
"grad_norm": 0.3279306888580322,
"kl": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0129,
"reward": 0.010714286705479026,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.10714286379516125,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 561.5714569091797,
"epoch": 0.046511627906976744,
"grad_norm": 0.14393426477909088,
"kl": 0.0003495216369628906,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0023,
"reward": 0.0017857144121080637,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.01785714365541935,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 548.8750228881836,
"epoch": 0.06201550387596899,
"grad_norm": 0.0009427572367712855,
"kl": 0.00028252601623535156,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 497.7857360839844,
"epoch": 0.07751937984496124,
"grad_norm": 0.19416852295398712,
"kl": 0.0003070831298828125,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0211,
"reward": 0.0035714288242161274,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0357142873108387,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 551.0893173217773,
"epoch": 0.09302325581395349,
"grad_norm": 0.2645115256309509,
"kl": 0.00034809112548828125,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0314,
"reward": 0.005357143469154835,
"reward_std": 0.007576144300401211,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0535714328289032,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 525.3750152587891,
"epoch": 0.10852713178294573,
"grad_norm": 0.4168189764022827,
"kl": 0.00038433074951171875,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0174,
"reward": 0.012500000884756446,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.12500000558793545,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 449.35716247558594,
"epoch": 0.12403100775193798,
"grad_norm": 0.26261523365974426,
"kl": 0.0006132125854492188,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0101,
"reward": 0.0035714288242161274,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0357142873108387,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 517.0178833007812,
"epoch": 0.13953488372093023,
"grad_norm": 0.38538143038749695,
"kl": 0.001514434814453125,
"learning_rate": 3e-06,
"loss": 0.0096,
"reward": 0.008928572293370962,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0892857201397419,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 548.1964492797852,
"epoch": 0.15503875968992248,
"grad_norm": 0.2276693731546402,
"kl": 0.0036163330078125,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.006,
"reward": 0.010714286705479026,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.10714286379516125,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 503.0714569091797,
"epoch": 0.17054263565891473,
"grad_norm": 0.45380452275276184,
"kl": 0.0091552734375,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0442,
"reward": 0.014285715529695153,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.14285715110599995,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 499.9285888671875,
"epoch": 0.18604651162790697,
"grad_norm": 0.4393024444580078,
"kl": 0.0289306640625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0051,
"reward": 0.028571431059390306,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.2857142947614193,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 444.10716247558594,
"epoch": 0.20155038759689922,
"grad_norm": 0.8636987209320068,
"kl": 0.110595703125,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0191,
"reward": 0.026785716880112886,
"reward_std": 0.03282995941117406,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.2678571604192257,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 417.57144927978516,
"epoch": 0.21705426356589147,
"grad_norm": 0.6144067645072937,
"kl": 0.06768798828125,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0193,
"reward": 0.03928571753203869,
"reward_std": 0.030304578132927418,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.3928571566939354,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 513.9107360839844,
"epoch": 0.23255813953488372,
"grad_norm": 0.5421202778816223,
"kl": 0.043365478515625,
"learning_rate": 5e-06,
"loss": 0.0393,
"reward": 0.04107143264263868,
"reward_std": 0.03282995941117406,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4107143059372902,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 529.9464492797852,
"epoch": 0.24806201550387597,
"grad_norm": 1.8159050941467285,
"kl": 0.0850830078125,
"learning_rate": 4.999952797253148e-06,
"loss": 0.093,
"reward": 0.03928571753203869,
"reward_std": 0.04040610417723656,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.3928571566939354,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 457.10716247558594,
"epoch": 0.26356589147286824,
"grad_norm": 0.5563008189201355,
"kl": 0.05609130859375,
"learning_rate": 4.9998111909931225e-06,
"loss": 0.0187,
"reward": 0.03392857359722257,
"reward_std": 0.022728432901203632,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.3392857201397419,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 478.19644927978516,
"epoch": 0.27906976744186046,
"grad_norm": 0.8018612861633301,
"kl": 0.04034423828125,
"learning_rate": 4.999575187161439e-06,
"loss": 0.0407,
"reward": 0.04642857518047094,
"reward_std": 0.04545686719939113,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4642857313156128,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 454.67860412597656,
"epoch": 0.29457364341085274,
"grad_norm": 0.4552249312400818,
"kl": 0.04693603515625,
"learning_rate": 4.9992447956603455e-06,
"loss": 0.013,
"reward": 0.055357146076858044,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5535714477300644,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 398.14288330078125,
"epoch": 0.31007751937984496,
"grad_norm": 0.6622409224510193,
"kl": 0.07818603515625,
"learning_rate": 4.998820030352409e-06,
"loss": 0.0427,
"reward": 0.06964286230504513,
"reward_std": 0.027779196621850133,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6964286118745804,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 411.42859649658203,
"epoch": 0.32558139534883723,
"grad_norm": 0.5362735390663147,
"kl": 0.0372314453125,
"learning_rate": 4.998300909059929e-06,
"loss": 0.0515,
"reward": 0.07857143320143223,
"reward_std": 0.025253815110772848,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.785714328289032,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 391.0000228881836,
"epoch": 0.34108527131782945,
"grad_norm": 0.5802915692329407,
"kl": 0.0638427734375,
"learning_rate": 4.997687453564198e-06,
"loss": 0.0016,
"reward": 0.06964286044239998,
"reward_std": 0.02777919638901949,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6964285969734192,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 352.16072845458984,
"epoch": 0.35658914728682173,
"grad_norm": 0.6233690977096558,
"kl": 0.1044921875,
"learning_rate": 4.9969796896045775e-06,
"loss": 0.0163,
"reward": 0.07500000484287739,
"reward_std": 0.02525381464511156,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7500000447034836,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 357.23216247558594,
"epoch": 0.37209302325581395,
"grad_norm": 0.5452607274055481,
"kl": 0.07611083984375,
"learning_rate": 4.996177646877426e-06,
"loss": 0.0347,
"reward": 0.0803571492433548,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 334.1428680419922,
"epoch": 0.3875968992248062,
"grad_norm": 0.2985096275806427,
"kl": 0.03887939453125,
"learning_rate": 4.995281359034851e-06,
"loss": 0.0163,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 346.44644927978516,
"epoch": 0.40310077519379844,
"grad_norm": 0.5771266222000122,
"kl": 0.145263671875,
"learning_rate": 4.994290863683296e-06,
"loss": 0.0187,
"reward": 0.08750000782310963,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 342.21429443359375,
"epoch": 0.4186046511627907,
"grad_norm": 1.0314879417419434,
"kl": 0.207275390625,
"learning_rate": 4.99320620238196e-06,
"loss": 0.0203,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 317.4107360839844,
"epoch": 0.43410852713178294,
"grad_norm": 0.205219104886055,
"kl": 0.04510498046875,
"learning_rate": 4.99202742064106e-06,
"loss": -0.0048,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 315.37501525878906,
"epoch": 0.4496124031007752,
"grad_norm": 0.5641531944274902,
"kl": 0.0771484375,
"learning_rate": 4.990754567919917e-06,
"loss": -0.0044,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 315.78572845458984,
"epoch": 0.46511627906976744,
"grad_norm": 0.15068137645721436,
"kl": 0.044952392578125,
"learning_rate": 4.989387697624881e-06,
"loss": -0.0039,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 362.2500228881836,
"epoch": 0.4806201550387597,
"grad_norm": 0.42976486682891846,
"kl": 0.083160400390625,
"learning_rate": 4.987926867107095e-06,
"loss": 0.0262,
"reward": 0.08571429178118706,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 328.03572845458984,
"epoch": 0.49612403100775193,
"grad_norm": 0.3264504373073578,
"kl": 0.0775146484375,
"learning_rate": 4.986372137660078e-06,
"loss": 0.0051,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 318.1071548461914,
"epoch": 0.5116279069767442,
"grad_norm": 0.29669874906539917,
"kl": 0.0478515625,
"learning_rate": 4.984723574517165e-06,
"loss": 0.0067,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 298.25001525878906,
"epoch": 0.5271317829457365,
"grad_norm": 0.14518219232559204,
"kl": 0.0523681640625,
"learning_rate": 4.9829812468487655e-06,
"loss": -0.0005,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 367.8393020629883,
"epoch": 0.5426356589147286,
"grad_norm": 0.2966887354850769,
"kl": 0.0465087890625,
"learning_rate": 4.981145227759457e-06,
"loss": 0.0274,
"reward": 0.09285714849829674,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714477300644,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 377.19644927978516,
"epoch": 0.5581395348837209,
"grad_norm": 0.2373329997062683,
"kl": 0.04595947265625,
"learning_rate": 4.979215594284924e-06,
"loss": -0.0014,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 376.28572845458984,
"epoch": 0.5736434108527132,
"grad_norm": 0.14644701778888702,
"kl": 0.04345703125,
"learning_rate": 4.977192427388722e-06,
"loss": 0.0002,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 360.6964416503906,
"epoch": 0.5891472868217055,
"grad_norm": 0.1754084676504135,
"kl": 0.04449462890625,
"learning_rate": 4.9750758119588824e-06,
"loss": -0.0018,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 420.2857437133789,
"epoch": 0.6046511627906976,
"grad_norm": 0.35420262813568115,
"kl": 0.05120849609375,
"learning_rate": 4.972865836804349e-06,
"loss": 0.0194,
"reward": 0.08928572200238705,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 306.25000762939453,
"epoch": 0.6201550387596899,
"grad_norm": 0.1385747194290161,
"kl": 0.03955078125,
"learning_rate": 4.970562594651254e-06,
"loss": 0.0075,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 383.6607360839844,
"epoch": 0.6356589147286822,
"grad_norm": 0.17503131926059723,
"kl": 0.03948974609375,
"learning_rate": 4.968166182139026e-06,
"loss": 0.0062,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 440.57144927978516,
"epoch": 0.6511627906976745,
"grad_norm": 0.1598208099603653,
"kl": 0.04473876953125,
"learning_rate": 4.9656766998163306e-06,
"loss": -0.0023,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 337.1428680419922,
"epoch": 0.6666666666666666,
"grad_norm": 0.23992463946342468,
"kl": 0.05145263671875,
"learning_rate": 4.963094252136865e-06,
"loss": -0.0088,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 359.94644927978516,
"epoch": 0.6821705426356589,
"grad_norm": 0.30762046575546265,
"kl": 0.048828125,
"learning_rate": 4.960418947454958e-06,
"loss": -0.007,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 370.23216247558594,
"epoch": 0.6976744186046512,
"grad_norm": 0.15281742811203003,
"kl": 0.04937744140625,
"learning_rate": 4.957650898021038e-06,
"loss": -0.0022,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 393.32144927978516,
"epoch": 0.7131782945736435,
"grad_norm": 0.1467144638299942,
"kl": 0.04290771484375,
"learning_rate": 4.954790219976915e-06,
"loss": -0.0129,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 395.44644927978516,
"epoch": 0.7286821705426356,
"grad_norm": 0.24572958052158356,
"kl": 0.04949951171875,
"learning_rate": 4.95183703335091e-06,
"loss": 0.0161,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 401.5357360839844,
"epoch": 0.7441860465116279,
"grad_norm": 0.2575523853302002,
"kl": 0.052490234375,
"learning_rate": 4.948791462052819e-06,
"loss": -0.014,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 385.5178680419922,
"epoch": 0.7596899224806202,
"grad_norm": 0.14166215062141418,
"kl": 0.06591796875,
"learning_rate": 4.945653633868716e-06,
"loss": 0.0074,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 411.2500228881836,
"epoch": 0.7751937984496124,
"grad_norm": 0.2869671583175659,
"kl": 0.04833984375,
"learning_rate": 4.942423680455584e-06,
"loss": 0.0156,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 387.4643020629883,
"epoch": 0.7906976744186046,
"grad_norm": 0.30038827657699585,
"kl": 0.061279296875,
"learning_rate": 4.939101737335802e-06,
"loss": -0.0201,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 398.2678756713867,
"epoch": 0.8062015503875969,
"grad_norm": 0.2649737596511841,
"kl": 0.08355712890625,
"learning_rate": 4.935687943891447e-06,
"loss": 0.0014,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 345.3214340209961,
"epoch": 0.8217054263565892,
"grad_norm": 0.17817121744155884,
"kl": 0.06097412109375,
"learning_rate": 4.932182443358458e-06,
"loss": 0.002,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 378.82144927978516,
"epoch": 0.8372093023255814,
"grad_norm": 0.2614600360393524,
"kl": 0.05889892578125,
"learning_rate": 4.928585382820616e-06,
"loss": 0.002,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 395.3035888671875,
"epoch": 0.8527131782945736,
"grad_norm": 0.19784440100193024,
"kl": 0.0565185546875,
"learning_rate": 4.924896913203376e-06,
"loss": 0.0135,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 370.87501525878906,
"epoch": 0.8682170542635659,
"grad_norm": 0.2502836585044861,
"kl": 0.05908203125,
"learning_rate": 4.921117189267535e-06,
"loss": 0.0157,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 391.4285888671875,
"epoch": 0.8837209302325582,
"grad_norm": 0.18611028790473938,
"kl": 0.06365966796875,
"learning_rate": 4.917246369602742e-06,
"loss": -0.0074,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 406.4643020629883,
"epoch": 0.8992248062015504,
"grad_norm": 0.23732154071331024,
"kl": 0.0538330078125,
"learning_rate": 4.9132846166208355e-06,
"loss": 0.0058,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 449.94644927978516,
"epoch": 0.9147286821705426,
"grad_norm": 0.1505957543849945,
"kl": 0.046630859375,
"learning_rate": 4.9092320965490365e-06,
"loss": 0.0166,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 458.14288330078125,
"epoch": 0.9302325581395349,
"grad_norm": 0.22972935438156128,
"kl": 0.04986572265625,
"learning_rate": 4.905088979422971e-06,
"loss": 0.0175,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 334.3035888671875,
"epoch": 0.9457364341085271,
"grad_norm": 0.3014618158340454,
"kl": 0.0665283203125,
"learning_rate": 4.900855439079536e-06,
"loss": -0.003,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 461.3750228881836,
"epoch": 0.9612403100775194,
"grad_norm": 0.22455042600631714,
"kl": 0.059326171875,
"learning_rate": 4.8965316531496055e-06,
"loss": 0.0138,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 458.57144927978516,
"epoch": 0.9767441860465116,
"grad_norm": 0.12987832725048065,
"kl": 0.05975341796875,
"learning_rate": 4.892117803050578e-06,
"loss": 0.0333,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 380.4375,
"epoch": 0.9922480620155039,
"grad_norm": 0.029962124302983284,
"kl": 0.0517578125,
"learning_rate": 4.887614073978761e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 375.07144927978516,
"epoch": 1.0155038759689923,
"grad_norm": 0.20877622067928314,
"kl": 0.06829833984375,
"learning_rate": 4.883020654901609e-06,
"loss": -0.0029,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 395.5178756713867,
"epoch": 1.0310077519379846,
"grad_norm": 0.2041924148797989,
"kl": 0.06427001953125,
"learning_rate": 4.878337738549785e-06,
"loss": -0.0086,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 385.44644927978516,
"epoch": 1.0465116279069768,
"grad_norm": 0.1686294972896576,
"kl": 0.044921875,
"learning_rate": 4.873565521409082e-06,
"loss": 0.0128,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 349.25001525878906,
"epoch": 1.062015503875969,
"grad_norm": 0.22423683106899261,
"kl": 0.0953369140625,
"learning_rate": 4.868704203712173e-06,
"loss": -0.0024,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 441.6964416503906,
"epoch": 1.0775193798449612,
"grad_norm": 0.335616797208786,
"kl": 0.0814208984375,
"learning_rate": 4.86375398943021e-06,
"loss": 0.0239,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 358.3928756713867,
"epoch": 1.0930232558139534,
"grad_norm": 0.019625969231128693,
"kl": 0.044921875,
"learning_rate": 4.858715086264274e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 369.2500228881836,
"epoch": 1.1085271317829457,
"grad_norm": 0.017356975004076958,
"kl": 0.04278564453125,
"learning_rate": 4.853587705636646e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 415.2678756713867,
"epoch": 1.124031007751938,
"grad_norm": 1.0988541841506958,
"kl": 0.21514892578125,
"learning_rate": 4.84837206268195e-06,
"loss": -0.0002,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 401.60716247558594,
"epoch": 1.1395348837209303,
"grad_norm": 0.07964562624692917,
"kl": 0.0548095703125,
"learning_rate": 4.8430683762381195e-06,
"loss": 0.0181,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 433.8571548461914,
"epoch": 1.1550387596899225,
"grad_norm": 0.3271082043647766,
"kl": 0.067138671875,
"learning_rate": 4.837676868837213e-06,
"loss": 0.0282,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 473.7500228881836,
"epoch": 1.1705426356589148,
"grad_norm": 0.11898969113826752,
"kl": 0.04522705078125,
"learning_rate": 4.832197766696085e-06,
"loss": 0.0467,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 332.6250228881836,
"epoch": 1.1860465116279069,
"grad_norm": 0.21207605302333832,
"kl": 0.08026123046875,
"learning_rate": 4.826631299706887e-06,
"loss": -0.0032,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 379.4285888671875,
"epoch": 1.2015503875968991,
"grad_norm": 0.22306586802005768,
"kl": 0.062255859375,
"learning_rate": 4.820977701427424e-06,
"loss": 0.0056,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 338.7143020629883,
"epoch": 1.2170542635658914,
"grad_norm": 0.16933457553386688,
"kl": 0.064208984375,
"learning_rate": 4.81523720907136e-06,
"loss": -0.0063,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 362.8393020629883,
"epoch": 1.2325581395348837,
"grad_norm": 0.8381237983703613,
"kl": 0.0596923828125,
"learning_rate": 4.809410063498254e-06,
"loss": 0.0006,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 414.0714416503906,
"epoch": 1.248062015503876,
"grad_norm": 0.20738717913627625,
"kl": 0.05206298828125,
"learning_rate": 4.8034965092034656e-06,
"loss": 0.0313,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 407.4285888671875,
"epoch": 1.2635658914728682,
"grad_norm": 0.3369165062904358,
"kl": 0.08013916015625,
"learning_rate": 4.797496794307889e-06,
"loss": 0.0061,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 408.1071548461914,
"epoch": 1.2790697674418605,
"grad_norm": 6.407706260681152,
"kl": 1.0523681640625,
"learning_rate": 4.791411170547545e-06,
"loss": 0.0007,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 340.71429443359375,
"epoch": 1.2945736434108528,
"grad_norm": 0.1786692589521408,
"kl": 0.05548095703125,
"learning_rate": 4.785239893263017e-06,
"loss": 0.0136,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 381.69644927978516,
"epoch": 1.310077519379845,
"grad_norm": 0.06986937671899796,
"kl": 0.0423583984375,
"learning_rate": 4.778983221388742e-06,
"loss": 0.0186,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 395.60716247558594,
"epoch": 1.3255813953488373,
"grad_norm": 1.0251904726028442,
"kl": 0.088134765625,
"learning_rate": 4.77264141744214e-06,
"loss": -0.0158,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 360.3393020629883,
"epoch": 1.3410852713178294,
"grad_norm": 0.41934439539909363,
"kl": 0.08447265625,
"learning_rate": 4.766214747512603e-06,
"loss": -0.009,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 382.8214416503906,
"epoch": 1.3565891472868217,
"grad_norm": 0.2629234194755554,
"kl": 0.05426025390625,
"learning_rate": 4.759703481250331e-06,
"loss": 0.0143,
"reward": 0.09464286454021931,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 319.1607360839844,
"epoch": 1.372093023255814,
"grad_norm": 0.4782038629055023,
"kl": 0.085693359375,
"learning_rate": 4.753107891855015e-06,
"loss": -0.0169,
"reward": 0.08571429178118706,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 361.7678680419922,
"epoch": 1.3875968992248062,
"grad_norm": 0.3007795512676239,
"kl": 0.0552978515625,
"learning_rate": 4.746428256064375e-06,
"loss": 0.0267,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 513.6785888671875,
"epoch": 1.4031007751937985,
"grad_norm": 0.37290942668914795,
"kl": 0.06085205078125,
"learning_rate": 4.7396648541425534e-06,
"loss": 0.0452,
"reward": 0.0857142936438322,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 487.2143020629883,
"epoch": 1.4186046511627908,
"grad_norm": 0.27805617451667786,
"kl": 0.06781005859375,
"learning_rate": 4.732817969868348e-06,
"loss": 0.0474,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 458.76788330078125,
"epoch": 1.4341085271317828,
"grad_norm": 0.23480121791362762,
"kl": 0.0584716796875,
"learning_rate": 4.7258878905233095e-06,
"loss": 0.0453,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 450.4643096923828,
"epoch": 1.449612403100775,
"grad_norm": 0.3803044855594635,
"kl": 0.0882568359375,
"learning_rate": 4.718874906879688e-06,
"loss": 0.0698,
"reward": 0.08214286155998707,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 431.6071548461914,
"epoch": 1.4651162790697674,
"grad_norm": 0.2859114110469818,
"kl": 0.05865478515625,
"learning_rate": 4.711779313188231e-06,
"loss": 0.0484,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 482.3750228881836,
"epoch": 1.4806201550387597,
"grad_norm": 0.27567797899246216,
"kl": 0.06036376953125,
"learning_rate": 4.70460140716584e-06,
"loss": 0.0909,
"reward": 0.08750000782310963,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 369.0714416503906,
"epoch": 1.496124031007752,
"grad_norm": 0.29379433393478394,
"kl": 0.05914306640625,
"learning_rate": 4.697341489983076e-06,
"loss": 0.0258,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 355.6428756713867,
"epoch": 1.5116279069767442,
"grad_norm": 0.34517690539360046,
"kl": 0.0787353515625,
"learning_rate": 4.6899998662515215e-06,
"loss": 0.0207,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 428.80360412597656,
"epoch": 1.5271317829457365,
"grad_norm": 0.21387967467308044,
"kl": 0.05517578125,
"learning_rate": 4.682576844011007e-06,
"loss": 0.0527,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 445.87501525878906,
"epoch": 1.5426356589147288,
"grad_norm": 0.27787187695503235,
"kl": 0.06585693359375,
"learning_rate": 4.675072734716678e-06,
"loss": 0.0585,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 521.1071701049805,
"epoch": 1.558139534883721,
"grad_norm": 0.2258799970149994,
"kl": 0.0574951171875,
"learning_rate": 4.667487853225931e-06,
"loss": 0.0816,
"reward": 0.08928572200238705,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 500.60716247558594,
"epoch": 1.5736434108527133,
"grad_norm": 0.18799901008605957,
"kl": 0.05963134765625,
"learning_rate": 4.659822517785203e-06,
"loss": 0.0686,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 431.82144927978516,
"epoch": 1.5891472868217056,
"grad_norm": 0.33642229437828064,
"kl": 0.0623779296875,
"learning_rate": 4.6520770500166165e-06,
"loss": 0.022,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 418.5357360839844,
"epoch": 1.6046511627906976,
"grad_norm": 0.17402252554893494,
"kl": 0.05926513671875,
"learning_rate": 4.644251774904487e-06,
"loss": 0.0472,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 341.46429443359375,
"epoch": 1.62015503875969,
"grad_norm": 0.16139744222164154,
"kl": 0.05474853515625,
"learning_rate": 4.636347020781684e-06,
"loss": 0.0078,
"reward": 0.09464286640286446,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 376.16072845458984,
"epoch": 1.6356589147286822,
"grad_norm": 0.20297406613826752,
"kl": 0.16015625,
"learning_rate": 4.6283631193158605e-06,
"loss": -0.0391,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 307.9285888671875,
"epoch": 1.6511627906976745,
"grad_norm": 0.02185610495507717,
"kl": 0.0635986328125,
"learning_rate": 4.620300405495532e-06,
"loss": 0.0006,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 427.39288330078125,
"epoch": 1.6666666666666665,
"grad_norm": 0.3790690004825592,
"kl": 0.08642578125,
"learning_rate": 4.612159217616022e-06,
"loss": 0.0327,
"reward": 0.08750000409781933,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 395.0893020629883,
"epoch": 1.6821705426356588,
"grad_norm": 0.1745668351650238,
"kl": 0.05218505859375,
"learning_rate": 4.603939897265268e-06,
"loss": 0.0428,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 332.05358123779297,
"epoch": 1.697674418604651,
"grad_norm": 0.1193130612373352,
"kl": 0.0545654296875,
"learning_rate": 4.595642789309492e-06,
"loss": 0.0111,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 355.9821548461914,
"epoch": 1.7131782945736433,
"grad_norm": 0.20527540147304535,
"kl": 0.07867431640625,
"learning_rate": 4.587268241878724e-06,
"loss": 0.0454,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 389.0357322692871,
"epoch": 1.7286821705426356,
"grad_norm": 0.08345554023981094,
"kl": 0.06036376953125,
"learning_rate": 4.578816606352205e-06,
"loss": 0.0398,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 408.4643096923828,
"epoch": 1.744186046511628,
"grad_norm": 0.1863243132829666,
"kl": 0.06109619140625,
"learning_rate": 4.570288237343632e-06,
"loss": 0.0374,
"reward": 0.09107143804430962,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 330.6428680419922,
"epoch": 1.7596899224806202,
"grad_norm": 0.03056999109685421,
"kl": 0.0721435546875,
"learning_rate": 4.561683492686289e-06,
"loss": 0.0007,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 331.5714416503906,
"epoch": 1.7751937984496124,
"grad_norm": 0.23037225008010864,
"kl": 0.06451416015625,
"learning_rate": 4.5530027334180285e-06,
"loss": -0.0047,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 312.2678756713867,
"epoch": 1.7906976744186047,
"grad_norm": 0.03742313012480736,
"kl": 0.0576171875,
"learning_rate": 4.544246323766122e-06,
"loss": 0.0006,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 488.35716247558594,
"epoch": 1.806201550387597,
"grad_norm": 0.2483549565076828,
"kl": 0.05865478515625,
"learning_rate": 4.535414631131983e-06,
"loss": 0.036,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 379.8035888671875,
"epoch": 1.8217054263565893,
"grad_norm": 0.30526259541511536,
"kl": 0.08013916015625,
"learning_rate": 4.526508026075746e-06,
"loss": 0.0156,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 341.1428756713867,
"epoch": 1.8372093023255816,
"grad_norm": 0.02339295670390129,
"kl": 0.056884765625,
"learning_rate": 4.517526882300721e-06,
"loss": 0.0006,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 388.2143020629883,
"epoch": 1.8527131782945736,
"grad_norm": 0.3421875834465027,
"kl": 0.0506591796875,
"learning_rate": 4.508471576637713e-06,
"loss": 0.037,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 325.6964416503906,
"epoch": 1.8682170542635659,
"grad_norm": 0.0237069521099329,
"kl": 0.04962158203125,
"learning_rate": 4.499342489029211e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 364.0714416503906,
"epoch": 1.8837209302325582,
"grad_norm": 0.2091287523508072,
"kl": 0.0748291015625,
"learning_rate": 4.490140002513449e-06,
"loss": 0.0171,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 380.4107360839844,
"epoch": 1.8992248062015504,
"grad_norm": 0.07675088196992874,
"kl": 0.05084228515625,
"learning_rate": 4.48086450320833e-06,
"loss": 0.0214,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 329.1428756713867,
"epoch": 1.9147286821705425,
"grad_norm": 0.017835261300206184,
"kl": 0.04571533203125,
"learning_rate": 4.4715163802952266e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 356.7321548461914,
"epoch": 1.9302325581395348,
"grad_norm": 3.949739694595337,
"kl": 0.5897216796875,
"learning_rate": 4.462096026002655e-06,
"loss": 0.0059,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 383.5893020629883,
"epoch": 1.945736434108527,
"grad_norm": 0.06876012682914734,
"kl": 0.05108642578125,
"learning_rate": 4.4526038355898144e-06,
"loss": 0.0192,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 342.75001525878906,
"epoch": 1.9612403100775193,
"grad_norm": 0.01601524092257023,
"kl": 0.0462646484375,
"learning_rate": 4.4430402073300035e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 340.2143020629883,
"epoch": 1.9767441860465116,
"grad_norm": 0.01760284975171089,
"kl": 0.04705810546875,
"learning_rate": 4.433405542493909e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 425.6875,
"epoch": 1.9922480620155039,
"grad_norm": 0.30735570192337036,
"kl": 0.07025146484375,
"learning_rate": 4.4237002453327734e-06,
"loss": -0.0102,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 398.1964569091797,
"epoch": 2.0155038759689923,
"grad_norm": 0.2743019163608551,
"kl": 0.05218505859375,
"learning_rate": 4.4139247230614245e-06,
"loss": 0.012,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 355.48216247558594,
"epoch": 2.0310077519379846,
"grad_norm": 0.02317599020898342,
"kl": 0.05474853515625,
"learning_rate": 4.404079385841201e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 440.14288330078125,
"epoch": 2.046511627906977,
"grad_norm": 0.12499076128005981,
"kl": 0.060546875,
"learning_rate": 4.394164646762734e-06,
"loss": 0.0395,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 349.6607360839844,
"epoch": 2.062015503875969,
"grad_norm": 0.3491656482219696,
"kl": 0.0546875,
"learning_rate": 4.384180921828618e-06,
"loss": -0.0162,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 356.9285888671875,
"epoch": 2.0775193798449614,
"grad_norm": 0.15453527867794037,
"kl": 0.04852294921875,
"learning_rate": 4.374128629935955e-06,
"loss": 0.0289,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 331.9464416503906,
"epoch": 2.0930232558139537,
"grad_norm": 0.016740234568715096,
"kl": 0.0595703125,
"learning_rate": 4.364008192858781e-06,
"loss": 0.0006,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 367.85716247558594,
"epoch": 2.108527131782946,
"grad_norm": 0.021411418914794922,
"kl": 0.052001953125,
"learning_rate": 4.353820035230366e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 333.62501525878906,
"epoch": 2.124031007751938,
"grad_norm": 0.01767720840871334,
"kl": 0.054443359375,
"learning_rate": 4.3435645845254e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 379.8393096923828,
"epoch": 2.13953488372093,
"grad_norm": 0.041266124695539474,
"kl": 0.0701904296875,
"learning_rate": 4.333242271042054e-06,
"loss": 0.0007,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 366.1071548461914,
"epoch": 2.1550387596899223,
"grad_norm": 0.20214009284973145,
"kl": 0.0625,
"learning_rate": 4.32285352788393e-06,
"loss": 0.0047,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 403.25001525878906,
"epoch": 2.1705426356589146,
"grad_norm": 0.14242787659168243,
"kl": 0.0548095703125,
"learning_rate": 4.312398790941882e-06,
"loss": 0.0036,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 381.4464416503906,
"epoch": 2.186046511627907,
"grad_norm": 0.2580341100692749,
"kl": 0.0653076171875,
"learning_rate": 4.301878498875735e-06,
"loss": 0.005,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 327.6428680419922,
"epoch": 2.201550387596899,
"grad_norm": 0.2158711850643158,
"kl": 0.05181884765625,
"learning_rate": 4.291293093095873e-06,
"loss": -0.0069,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 352.7321548461914,
"epoch": 2.2170542635658914,
"grad_norm": 0.12166262418031693,
"kl": 0.0693359375,
"learning_rate": 4.280643017744723e-06,
"loss": 0.0219,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 348.8393020629883,
"epoch": 2.2325581395348837,
"grad_norm": 0.07379510253667831,
"kl": 0.06195068359375,
"learning_rate": 4.269928719678117e-06,
"loss": 0.0219,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 338.75001525878906,
"epoch": 2.248062015503876,
"grad_norm": 0.014747419394552708,
"kl": 0.04913330078125,
"learning_rate": 4.2591506484465426e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 440.62501525878906,
"epoch": 2.2635658914728682,
"grad_norm": 0.09444686770439148,
"kl": 0.0557861328125,
"learning_rate": 4.248309256276283e-06,
"loss": 0.029,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 300.6785888671875,
"epoch": 2.2790697674418605,
"grad_norm": 0.018641607835888863,
"kl": 0.05596923828125,
"learning_rate": 4.23740499805044e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 366.60716247558594,
"epoch": 2.294573643410853,
"grad_norm": 0.06887350976467133,
"kl": 0.04541015625,
"learning_rate": 4.22643833128985e-06,
"loss": 0.0169,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 416.73216247558594,
"epoch": 2.310077519379845,
"grad_norm": 0.06718391925096512,
"kl": 0.05230712890625,
"learning_rate": 4.215409716133885e-06,
"loss": 0.0397,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 381.0000228881836,
"epoch": 2.3255813953488373,
"grad_norm": 0.1807098686695099,
"kl": 0.05438232421875,
"learning_rate": 4.204319615321151e-06,
"loss": -0.0073,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 390.5357360839844,
"epoch": 2.3410852713178296,
"grad_norm": 0.1976163387298584,
"kl": 0.07647705078125,
"learning_rate": 4.193168494170065e-06,
"loss": 0.0157,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 450.6428680419922,
"epoch": 2.356589147286822,
"grad_norm": 0.06315501034259796,
"kl": 0.0489501953125,
"learning_rate": 4.181956820559339e-06,
"loss": 0.0366,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 371.6785888671875,
"epoch": 2.3720930232558137,
"grad_norm": 0.05957993492484093,
"kl": 0.04547119140625,
"learning_rate": 4.170685064908342e-06,
"loss": 0.0189,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 372.7143020629883,
"epoch": 2.387596899224806,
"grad_norm": 0.011611810885369778,
"kl": 0.04443359375,
"learning_rate": 4.159353700157365e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 372.6250228881836,
"epoch": 2.4031007751937983,
"grad_norm": 0.03919665887951851,
"kl": 0.0506591796875,
"learning_rate": 4.14796320174778e-06,
"loss": 0.0189,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 329.1428680419922,
"epoch": 2.4186046511627906,
"grad_norm": 0.0129386056214571,
"kl": 0.0457763671875,
"learning_rate": 4.136514047602087e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 417.12501525878906,
"epoch": 2.434108527131783,
"grad_norm": 0.08656369149684906,
"kl": 0.0552978515625,
"learning_rate": 4.1250067181038635e-06,
"loss": 0.0549,
"reward": 0.09464286454021931,
"reward_std": 0.007576144300401211,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.946428582072258,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 356.2321586608887,
"epoch": 2.449612403100775,
"grad_norm": 0.09912148863077164,
"kl": 0.06451416015625,
"learning_rate": 4.113441696077608e-06,
"loss": 0.0215,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 381.1964416503906,
"epoch": 2.4651162790697674,
"grad_norm": 0.16969716548919678,
"kl": 0.05877685546875,
"learning_rate": 4.101819466768484e-06,
"loss": 0.0143,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 333.3214416503906,
"epoch": 2.4806201550387597,
"grad_norm": 0.011440815404057503,
"kl": 0.04791259765625,
"learning_rate": 4.0901405178219535e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 350.73216247558594,
"epoch": 2.496124031007752,
"grad_norm": 0.036816567182540894,
"kl": 0.04779052734375,
"learning_rate": 4.078405339263326e-06,
"loss": 0.0217,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 361.5535888671875,
"epoch": 2.511627906976744,
"grad_norm": 0.026230594143271446,
"kl": 0.0477294921875,
"learning_rate": 4.06661442347719e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 341.0178680419922,
"epoch": 2.5271317829457365,
"grad_norm": 0.012676913291215897,
"kl": 0.04327392578125,
"learning_rate": 4.054768265186758e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 347.12501525878906,
"epoch": 2.5426356589147288,
"grad_norm": 0.012835390865802765,
"kl": 0.04644775390625,
"learning_rate": 4.0428673614331036e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 398.5178756713867,
"epoch": 2.558139534883721,
"grad_norm": 0.13431993126869202,
"kl": 0.0458984375,
"learning_rate": 4.030912211554316e-06,
"loss": 0.0172,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 382.1607208251953,
"epoch": 2.5736434108527133,
"grad_norm": 0.1748451292514801,
"kl": 0.04718017578125,
"learning_rate": 4.018903317164539e-06,
"loss": 0.0086,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 461.58931732177734,
"epoch": 2.5891472868217056,
"grad_norm": 0.2945731580257416,
"kl": 0.0743408203125,
"learning_rate": 4.006841182132932e-06,
"loss": 0.0594,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 396.3393096923828,
"epoch": 2.604651162790698,
"grad_norm": 0.04062338173389435,
"kl": 0.0452880859375,
"learning_rate": 3.9947263125625195e-06,
"loss": 0.0198,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 416.41072845458984,
"epoch": 2.62015503875969,
"grad_norm": 136.00784301757812,
"kl": 1.88348388671875,
"learning_rate": 3.982559216768967e-06,
"loss": 0.07,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 289.2678680419922,
"epoch": 2.6356589147286824,
"grad_norm": 0.021940352395176888,
"kl": 0.056640625,
"learning_rate": 3.970340405259245e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 371.0178756713867,
"epoch": 2.6511627906976747,
"grad_norm": 0.24385367333889008,
"kl": 0.12432861328125,
"learning_rate": 3.958070390710214e-06,
"loss": 0.0023,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 343.51787185668945,
"epoch": 2.6666666666666665,
"grad_norm": 0.048726681619882584,
"kl": 0.0433349609375,
"learning_rate": 3.945749687947109e-06,
"loss": 0.0171,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 370.1607360839844,
"epoch": 2.682170542635659,
"grad_norm": 0.16867610812187195,
"kl": 0.06219482421875,
"learning_rate": 3.933378813921942e-06,
"loss": -0.0057,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 349.2857360839844,
"epoch": 2.697674418604651,
"grad_norm": 0.23669791221618652,
"kl": 0.11749267578125,
"learning_rate": 3.920958287691811e-06,
"loss": -0.0063,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 339.57144927978516,
"epoch": 2.7131782945736433,
"grad_norm": 0.04533864185214043,
"kl": 0.0574951171875,
"learning_rate": 3.908488630397121e-06,
"loss": 0.0006,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 309.00001525878906,
"epoch": 2.7286821705426356,
"grad_norm": 0.08899213373661041,
"kl": 0.05792236328125,
"learning_rate": 3.8959703652397175e-06,
"loss": 0.0087,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 377.30359649658203,
"epoch": 2.744186046511628,
"grad_norm": 0.05321886017918587,
"kl": 0.05023193359375,
"learning_rate": 3.883404017460935e-06,
"loss": 0.0179,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 440.39288330078125,
"epoch": 2.75968992248062,
"grad_norm": 0.07496553659439087,
"kl": 0.0521240234375,
"learning_rate": 3.870790114319559e-06,
"loss": 0.0422,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 424.0357360839844,
"epoch": 2.7751937984496124,
"grad_norm": 0.24973690509796143,
"kl": 0.05303955078125,
"learning_rate": 3.858129185069701e-06,
"loss": 0.0246,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 346.25001525878906,
"epoch": 2.7906976744186047,
"grad_norm": 0.048278991132974625,
"kl": 0.07000732421875,
"learning_rate": 3.845421760938597e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 346.9464416503906,
"epoch": 2.806201550387597,
"grad_norm": 0.09487508982419968,
"kl": 0.07293701171875,
"learning_rate": 3.832668375104312e-06,
"loss": 0.0159,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 357.0535888671875,
"epoch": 2.8217054263565893,
"grad_norm": 0.013098032213747501,
"kl": 0.04278564453125,
"learning_rate": 3.8198695626733725e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 311.6607360839844,
"epoch": 2.8372093023255816,
"grad_norm": 0.013701863586902618,
"kl": 0.0498046875,
"learning_rate": 3.8070258606583156e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 418.73216247558594,
"epoch": 2.8527131782945734,
"grad_norm": 0.0741962194442749,
"kl": 0.06573486328125,
"learning_rate": 3.7941378079551544e-06,
"loss": 0.0418,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 438.21429443359375,
"epoch": 2.8682170542635657,
"grad_norm": 0.08349604904651642,
"kl": 0.05194091796875,
"learning_rate": 3.7812059453207677e-06,
"loss": 0.0336,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 491.51788330078125,
"epoch": 2.883720930232558,
"grad_norm": 0.17697231471538544,
"kl": 0.05194091796875,
"learning_rate": 3.768230815350213e-06,
"loss": 0.0333,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 363.3571548461914,
"epoch": 2.89922480620155,
"grad_norm": 0.012233450077474117,
"kl": 0.0384521484375,
"learning_rate": 3.7552129624539557e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 374.66072845458984,
"epoch": 2.9147286821705425,
"grad_norm": 0.09323134273290634,
"kl": 0.04193115234375,
"learning_rate": 3.7421529328350316e-06,
"loss": 0.018,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 343.25001525878906,
"epoch": 2.9302325581395348,
"grad_norm": 0.17554545402526855,
"kl": 0.05010986328125,
"learning_rate": 3.7290512744661274e-06,
"loss": -0.0053,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 346.7857208251953,
"epoch": 2.945736434108527,
"grad_norm": 0.012803681194782257,
"kl": 0.041259765625,
"learning_rate": 3.715908537066589e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 393.5357360839844,
"epoch": 2.9612403100775193,
"grad_norm": 0.048317961394786835,
"kl": 0.04132080078125,
"learning_rate": 3.7027252720793538e-06,
"loss": 0.016,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 422.7143096923828,
"epoch": 2.9767441860465116,
"grad_norm": 0.6855605244636536,
"kl": 0.22027587890625,
"learning_rate": 3.689502032647817e-06,
"loss": 0.0154,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 356.15625,
"epoch": 2.992248062015504,
"grad_norm": 0.014512370340526104,
"kl": 0.04461669921875,
"learning_rate": 3.6762393735926245e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 416.5893020629883,
"epoch": 3.0155038759689923,
"grad_norm": 0.39832741022109985,
"kl": 0.10870361328125,
"learning_rate": 3.6629378513883852e-06,
"loss": 0.0074,
"reward": 0.09107143618166447,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 366.1964416503906,
"epoch": 3.0310077519379846,
"grad_norm": 0.19380412995815277,
"kl": 0.048828125,
"learning_rate": 3.6495980241403307e-06,
"loss": -0.0012,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 428.7321548461914,
"epoch": 3.046511627906977,
"grad_norm": 0.2720411717891693,
"kl": 0.0596923828125,
"learning_rate": 3.636220451560896e-06,
"loss": 0.0191,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 377.3035888671875,
"epoch": 3.062015503875969,
"grad_norm": 0.06497833132743835,
"kl": 0.0489501953125,
"learning_rate": 3.622805694946235e-06,
"loss": 0.013,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 473.8035888671875,
"epoch": 3.0775193798449614,
"grad_norm": 0.016108330339193344,
"kl": 0.04351806640625,
"learning_rate": 3.609354317152667e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 443.94644927978516,
"epoch": 3.0930232558139537,
"grad_norm": 9.35707950592041,
"kl": 0.8321533203125,
"learning_rate": 3.595866882573063e-06,
"loss": 0.0221,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 404.50001525878906,
"epoch": 3.108527131782946,
"grad_norm": 0.013096613809466362,
"kl": 0.03851318359375,
"learning_rate": 3.5823439571131675e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 449.2678756713867,
"epoch": 3.124031007751938,
"grad_norm": 0.14225821197032928,
"kl": 0.04180908203125,
"learning_rate": 3.5687861081678477e-06,
"loss": 0.0035,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 483.92859649658203,
"epoch": 3.13953488372093,
"grad_norm": 0.18597643077373505,
"kl": 0.045745849609375,
"learning_rate": 3.555193904597291e-06,
"loss": 0.0368,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 385.9643096923828,
"epoch": 3.1550387596899223,
"grad_norm": 0.29447314143180847,
"kl": 0.12945556640625,
"learning_rate": 3.541567916703138e-06,
"loss": -0.006,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 453.83931732177734,
"epoch": 3.1705426356589146,
"grad_norm": 0.1709349900484085,
"kl": 0.0709228515625,
"learning_rate": 3.5279087162045517e-06,
"loss": 0.0165,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 423.6964416503906,
"epoch": 3.186046511627907,
"grad_norm": 0.28060221672058105,
"kl": 0.04339599609375,
"learning_rate": 3.5142168762142265e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 383.78572845458984,
"epoch": 3.201550387596899,
"grad_norm": 0.06871840357780457,
"kl": 0.037109375,
"learning_rate": 3.500492971214347e-06,
"loss": 0.0126,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 422.1964569091797,
"epoch": 3.2170542635658914,
"grad_norm": 0.15468138456344604,
"kl": 0.04852294921875,
"learning_rate": 3.48673757703248e-06,
"loss": 0.0165,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 365.0893020629883,
"epoch": 3.2325581395348837,
"grad_norm": 0.18128041923046112,
"kl": 0.04852294921875,
"learning_rate": 3.472951270817418e-06,
"loss": -0.002,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 370.8393096923828,
"epoch": 3.248062015503876,
"grad_norm": 0.05891520529985428,
"kl": 0.0443115234375,
"learning_rate": 3.4591346310149578e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 392.67859649658203,
"epoch": 3.2635658914728682,
"grad_norm": 0.03135214000940323,
"kl": 0.04595947265625,
"learning_rate": 3.445288237343632e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 481.7143096923828,
"epoch": 3.2790697674418605,
"grad_norm": 0.07085608690977097,
"kl": 0.04840087890625,
"learning_rate": 3.4314126707703895e-06,
"loss": 0.0141,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 439.2678756713867,
"epoch": 3.294573643410853,
"grad_norm": 0.2375049591064453,
"kl": 0.04571533203125,
"learning_rate": 3.4175085134862128e-06,
"loss": 0.0463,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 446.10716247558594,
"epoch": 3.310077519379845,
"grad_norm": 0.11966075003147125,
"kl": 0.0457763671875,
"learning_rate": 3.4035763488816953e-06,
"loss": 0.0118,
"reward": 0.09464286454021931,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 461.8393096923828,
"epoch": 3.3255813953488373,
"grad_norm": 0.22044996917247772,
"kl": 0.05328369140625,
"learning_rate": 3.3896167615225594e-06,
"loss": 0.003,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 445.7857360839844,
"epoch": 3.3410852713178296,
"grad_norm": 0.12458500266075134,
"kl": 0.0550537109375,
"learning_rate": 3.375630337125133e-06,
"loss": 0.0142,
"reward": 0.09464286640286446,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 409.30358123779297,
"epoch": 3.356589147286822,
"grad_norm": 0.4201054871082306,
"kl": 0.03924560546875,
"learning_rate": 3.361617662531772e-06,
"loss": 0.0185,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 341.03572845458984,
"epoch": 3.3720930232558137,
"grad_norm": 0.022784234955906868,
"kl": 0.04339599609375,
"learning_rate": 3.347579325686237e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 482.7321472167969,
"epoch": 3.387596899224806,
"grad_norm": 15.430904388427734,
"kl": 4.7401123046875,
"learning_rate": 3.333515915609027e-06,
"loss": 0.1214,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 391.1607360839844,
"epoch": 3.4031007751937983,
"grad_norm": 2.3437180519104004,
"kl": 0.5184326171875,
"learning_rate": 3.3194280223726616e-06,
"loss": 0.0116,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 383.30358123779297,
"epoch": 3.4186046511627906,
"grad_norm": 1.4211961030960083,
"kl": 0.56317138671875,
"learning_rate": 3.305316237076927e-06,
"loss": 0.0405,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 390.73216247558594,
"epoch": 3.434108527131783,
"grad_norm": 11.418591499328613,
"kl": 1.92529296875,
"learning_rate": 3.291181151824071e-06,
"loss": 0.0215,
"reward": 0.09107143618166447,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 427.0000305175781,
"epoch": 3.449612403100775,
"grad_norm": 4.976629734039307,
"kl": 0.14874267578125,
"learning_rate": 3.27702335969396e-06,
"loss": 0.0572,
"reward": 0.0892857201397419,
"reward_std": 0.015152288600802422,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 427.9285888671875,
"epoch": 3.4651162790697674,
"grad_norm": 0.43471699953079224,
"kl": 0.1776123046875,
"learning_rate": 3.2628434547191985e-06,
"loss": -0.0052,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 348.7321548461914,
"epoch": 3.4806201550387597,
"grad_norm": 0.029118988662958145,
"kl": 0.04644775390625,
"learning_rate": 3.2486420318601973e-06,
"loss": 0.0005,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 368.0714416503906,
"epoch": 3.496124031007752,
"grad_norm": 0.1969047337770462,
"kl": 0.06158447265625,
"learning_rate": 3.2344196869802187e-06,
"loss": 0.0168,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 359.48217010498047,
"epoch": 3.511627906976744,
"grad_norm": 0.05316058173775673,
"kl": 0.04388427734375,
"learning_rate": 3.2201770168203694e-06,
"loss": 0.0004,
"reward": 0.10000000894069672,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 416.55358123779297,
"epoch": 3.5271317829457365,
"grad_norm": 1.583228349685669,
"kl": 0.6258544921875,
"learning_rate": 3.205914618974563e-06,
"loss": 0.0457,
"reward": 0.08750000596046448,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 396.3214416503906,
"epoch": 3.5426356589147288,
"grad_norm": 0.8248907923698425,
"kl": 0.10003662109375,
"learning_rate": 3.1916330918644496e-06,
"loss": 0.0307,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 430.0535888671875,
"epoch": 3.558139534883721,
"grad_norm": 0.37740814685821533,
"kl": 0.19354248046875,
"learning_rate": 3.177333034714303e-06,
"loss": 0.0462,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 360.2321548461914,
"epoch": 3.5736434108527133,
"grad_norm": 0.2104271948337555,
"kl": 0.05908203125,
"learning_rate": 3.1630150475258813e-06,
"loss": 0.0171,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 482.6964569091797,
"epoch": 3.5891472868217056,
"grad_norm": 3.0259857177734375,
"kl": 2.01220703125,
"learning_rate": 3.148679731053252e-06,
"loss": 0.0681,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 454.9821548461914,
"epoch": 3.604651162790698,
"grad_norm": 0.9339037537574768,
"kl": 0.580322265625,
"learning_rate": 3.1343276867775805e-06,
"loss": 0.0536,
"reward": 0.08571429178118706,
"reward_std": 0.015152288600802422,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 459.0178756713867,
"epoch": 3.62015503875969,
"grad_norm": 1.466651201248169,
"kl": 0.37054443359375,
"learning_rate": 3.1199595168819043e-06,
"loss": 0.0784,
"reward": 0.0892857164144516,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 436.00001525878906,
"epoch": 3.6356589147286824,
"grad_norm": 5.144193649291992,
"kl": 0.2822265625,
"learning_rate": 3.105575824225852e-06,
"loss": 0.0738,
"reward": 0.08750000968575478,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000596046448,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 389.12501525878906,
"epoch": 3.6511627906976747,
"grad_norm": 2.5096967220306396,
"kl": 0.32818603515625,
"learning_rate": 3.091177212320363e-06,
"loss": 0.0412,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 323.6071548461914,
"epoch": 3.6666666666666665,
"grad_norm": 0.6911299824714661,
"kl": 0.5263671875,
"learning_rate": 3.0767642853023538e-06,
"loss": -0.0336,
"reward": 0.09285715036094189,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714477300644,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 446.2143020629883,
"epoch": 3.682170542635659,
"grad_norm": 5.997923374176025,
"kl": 4.308349609375,
"learning_rate": 3.062337647909376e-06,
"loss": 0.0867,
"reward": 0.0892857201397419,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 362.3214416503906,
"epoch": 3.697674418604651,
"grad_norm": 4.273003578186035,
"kl": 2.62890625,
"learning_rate": 3.04789790545424e-06,
"loss": 0.0346,
"reward": 0.0892857238650322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 436.6250228881836,
"epoch": 3.7131782945736433,
"grad_norm": 0.8113691210746765,
"kl": 1.3873291015625,
"learning_rate": 3.033445663799621e-06,
"loss": 0.0157,
"reward": 0.08750000596046448,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 365.8214454650879,
"epoch": 3.7286821705426356,
"grad_norm": 6.4786882400512695,
"kl": 0.678466796875,
"learning_rate": 3.018981529332633e-06,
"loss": 0.0158,
"reward": 0.0892857201397419,
"reward_std": 0.015152289299294353,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 359.2678642272949,
"epoch": 3.744186046511628,
"grad_norm": 2.5377206802368164,
"kl": 1.3226318359375,
"learning_rate": 3.00450610893939e-06,
"loss": 0.0097,
"reward": 0.09107143245637417,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 362.1428680419922,
"epoch": 3.75968992248062,
"grad_norm": 4.049387454986572,
"kl": 0.501953125,
"learning_rate": 2.9900200099795396e-06,
"loss": 0.0179,
"reward": 0.09107143245637417,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 388.17859649658203,
"epoch": 3.7751937984496124,
"grad_norm": 7.053500175476074,
"kl": 4.607421875,
"learning_rate": 2.9755238402607826e-06,
"loss": 0.056,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 320.6250114440918,
"epoch": 3.7906976744186047,
"grad_norm": 3.405297040939331,
"kl": 3.1016845703125,
"learning_rate": 2.961018208013367e-06,
"loss": 0.0188,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 316.2143020629883,
"epoch": 3.806201550387597,
"grad_norm": 0.8333543539047241,
"kl": 0.9293212890625,
"learning_rate": 2.9465037218645694e-06,
"loss": 0.0027,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 415.42859268188477,
"epoch": 3.8217054263565893,
"grad_norm": 0.9986127018928528,
"kl": 2.35595703125,
"learning_rate": 2.9319809908131604e-06,
"loss": -0.0196,
"reward": 0.08392857946455479,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 371.17859649658203,
"epoch": 3.8372093023255816,
"grad_norm": 0.5152451395988464,
"kl": 1.0821533203125,
"learning_rate": 2.917450624203847e-06,
"loss": 0.0239,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 370.41072845458984,
"epoch": 3.8527131782945734,
"grad_norm": 0.3054462671279907,
"kl": 0.1796875,
"learning_rate": 2.9029132317017118e-06,
"loss": 0.0193,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 312.46429443359375,
"epoch": 3.8682170542635657,
"grad_norm": 0.3965552747249603,
"kl": 0.6539306640625,
"learning_rate": 2.888369423266629e-06,
"loss": 0.0087,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 296.3571548461914,
"epoch": 3.883720930232558,
"grad_norm": 1.3105534315109253,
"kl": 0.5010986328125,
"learning_rate": 2.8738198091276712e-06,
"loss": -0.0163,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 282.8035888671875,
"epoch": 3.89922480620155,
"grad_norm": 0.4267464876174927,
"kl": 0.328857421875,
"learning_rate": 2.859264999757509e-06,
"loss": 0.0033,
"reward": 0.0982142947614193,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 334.2143020629883,
"epoch": 3.9147286821705425,
"grad_norm": 0.5196983814239502,
"kl": 0.22705078125,
"learning_rate": 2.8447056058467928e-06,
"loss": 0.041,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 390.60717010498047,
"epoch": 3.9302325581395348,
"grad_norm": 2.4938759803771973,
"kl": 2.436279296875,
"learning_rate": 2.830142238278531e-06,
"loss": 0.0643,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 369.7143020629883,
"epoch": 3.945736434108527,
"grad_norm": 19.98564910888672,
"kl": 3.283447265625,
"learning_rate": 2.81557550810246e-06,
"loss": 0.1163,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 251.08929824829102,
"epoch": 3.9612403100775193,
"grad_norm": 0.8202245831489563,
"kl": 1.39990234375,
"learning_rate": 2.8010060265094026e-06,
"loss": -0.0223,
"reward": 0.09642857685685158,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 405.80359649658203,
"epoch": 3.9767441860465116,
"grad_norm": 0.16511620581150055,
"kl": 0.1580810546875,
"learning_rate": 2.786434404805629e-06,
"loss": 0.0429,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 435.65625,
"epoch": 3.992248062015504,
"grad_norm": 0.6317914724349976,
"kl": 1.050048828125,
"learning_rate": 2.771861254387199e-06,
"loss": 0.0243,
"reward": 0.09107143618166447,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 387.25000762939453,
"epoch": 4.015503875968992,
"grad_norm": 2.031557321548462,
"kl": 2.938232421875,
"learning_rate": 2.7572871867143204e-06,
"loss": 0.0425,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 363.5535888671875,
"epoch": 4.0310077519379846,
"grad_norm": 0.5578765273094177,
"kl": 0.310546875,
"learning_rate": 2.742712813285681e-06,
"loss": 0.0663,
"reward": 0.09285714663565159,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 380.5178756713867,
"epoch": 4.046511627906977,
"grad_norm": 0.3609902858734131,
"kl": 0.613037109375,
"learning_rate": 2.7281387456128017e-06,
"loss": 0.0103,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 431.25001525878906,
"epoch": 4.062015503875969,
"grad_norm": 0.13468655943870544,
"kl": 0.6370849609375,
"learning_rate": 2.7135655951943716e-06,
"loss": 0.0617,
"reward": 0.09107143431901932,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 424.642879486084,
"epoch": 4.077519379844961,
"grad_norm": 0.7959390878677368,
"kl": 1.122314453125,
"learning_rate": 2.698993973490598e-06,
"loss": 0.04,
"reward": 0.0892857201397419,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 325.5178756713867,
"epoch": 4.093023255813954,
"grad_norm": 0.541179895401001,
"kl": 0.6156005859375,
"learning_rate": 2.6844244918975416e-06,
"loss": 0.009,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 484.4643096923828,
"epoch": 4.108527131782946,
"grad_norm": 0.6031014919281006,
"kl": 0.8077392578125,
"learning_rate": 2.66985776172147e-06,
"loss": 0.0724,
"reward": 0.08750000596046448,
"reward_std": 0.017677670810371637,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 322.8035888671875,
"epoch": 4.124031007751938,
"grad_norm": 0.9272570610046387,
"kl": 0.213623046875,
"learning_rate": 2.6552943941532088e-06,
"loss": 0.014,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 353.35716247558594,
"epoch": 4.1395348837209305,
"grad_norm": 0.8877429366111755,
"kl": 0.3662109375,
"learning_rate": 2.6407350002424927e-06,
"loss": 0.0444,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 388.8571548461914,
"epoch": 4.155038759689923,
"grad_norm": 0.2517595887184143,
"kl": 0.248046875,
"learning_rate": 2.626180190872329e-06,
"loss": 0.0563,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 397.2857360839844,
"epoch": 4.170542635658915,
"grad_norm": 0.45230913162231445,
"kl": 0.7420654296875,
"learning_rate": 2.611630576733372e-06,
"loss": 0.0146,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 352.67858123779297,
"epoch": 4.186046511627907,
"grad_norm": 4.2593770027160645,
"kl": 4.50927734375,
"learning_rate": 2.5970867682982885e-06,
"loss": 0.068,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 337.42859268188477,
"epoch": 4.2015503875969,
"grad_norm": 7.50376033782959,
"kl": 10.3663330078125,
"learning_rate": 2.582549375796154e-06,
"loss": 0.0469,
"reward": 0.0892857201397419,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 367.7143135070801,
"epoch": 4.217054263565892,
"grad_norm": 0.7440657615661621,
"kl": 0.711181640625,
"learning_rate": 2.568019009186841e-06,
"loss": 0.0718,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 337.96429443359375,
"epoch": 4.232558139534884,
"grad_norm": 0.491629421710968,
"kl": 1.0819091796875,
"learning_rate": 2.5534962781354317e-06,
"loss": 0.039,
"reward": 0.09464286454021931,
"reward_std": 0.007576144300401211,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.946428582072258,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 377.5893020629883,
"epoch": 4.248062015503876,
"grad_norm": 0.3443954885005951,
"kl": 0.227294921875,
"learning_rate": 2.538981791986634e-06,
"loss": 0.0389,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 383.6428680419922,
"epoch": 4.263565891472869,
"grad_norm": 1.2458945512771606,
"kl": 0.91650390625,
"learning_rate": 2.524476159739218e-06,
"loss": 0.0354,
"reward": 0.08928572200238705,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 354.92858123779297,
"epoch": 4.27906976744186,
"grad_norm": 1.8443901538848877,
"kl": 1.0614013671875,
"learning_rate": 2.5099799900204607e-06,
"loss": 0.0072,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 440.35716247558594,
"epoch": 4.294573643410852,
"grad_norm": 0.3539612293243408,
"kl": 0.1920166015625,
"learning_rate": 2.4954938910606108e-06,
"loss": 0.0495,
"reward": 0.0892857201397419,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 407.6785888671875,
"epoch": 4.310077519379845,
"grad_norm": 0.10979936271905899,
"kl": 0.1221923828125,
"learning_rate": 2.481018470667368e-06,
"loss": 0.0797,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 408.23216247558594,
"epoch": 4.325581395348837,
"grad_norm": 1.0740007162094116,
"kl": 1.7060546875,
"learning_rate": 2.4665543362003802e-06,
"loss": 0.057,
"reward": 0.0857142936438322,
"reward_std": 0.020203052321448922,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 389.9643020629883,
"epoch": 4.341085271317829,
"grad_norm": 0.8059707880020142,
"kl": 1.093017578125,
"learning_rate": 2.4521020945457615e-06,
"loss": 0.0484,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 458.00001525878906,
"epoch": 4.3565891472868215,
"grad_norm": 3.3947207927703857,
"kl": 0.936279296875,
"learning_rate": 2.4376623520906255e-06,
"loss": 0.1075,
"reward": 0.08571428991854191,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 359.9821548461914,
"epoch": 4.372093023255814,
"grad_norm": 0.2123708724975586,
"kl": 0.177001953125,
"learning_rate": 2.4232357146976478e-06,
"loss": 0.0428,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 373.87501525878906,
"epoch": 4.387596899224806,
"grad_norm": 1.0469727516174316,
"kl": 2.60693359375,
"learning_rate": 2.408822787679637e-06,
"loss": 0.0565,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 300.5714416503906,
"epoch": 4.403100775193798,
"grad_norm": 0.21965286135673523,
"kl": 0.1136474609375,
"learning_rate": 2.3944241757741475e-06,
"loss": 0.0211,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 497.08931732177734,
"epoch": 4.4186046511627906,
"grad_norm": 0.4067678451538086,
"kl": 1.36474609375,
"learning_rate": 2.380040483118097e-06,
"loss": 0.1029,
"reward": 0.08571429178118706,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 493.3928756713867,
"epoch": 4.434108527131783,
"grad_norm": 0.29808875918388367,
"kl": 0.3673095703125,
"learning_rate": 2.365672313222419e-06,
"loss": 0.0918,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 578.4643249511719,
"epoch": 4.449612403100775,
"grad_norm": 0.47009024024009705,
"kl": 0.503173828125,
"learning_rate": 2.351320268946749e-06,
"loss": 0.1227,
"reward": 0.08214286155998707,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214285969734192,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 467.6071548461914,
"epoch": 4.465116279069767,
"grad_norm": 1.2561384439468384,
"kl": 0.22412109375,
"learning_rate": 2.336984952474119e-06,
"loss": 0.067,
"reward": 0.08928572200238705,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 370.3214416503906,
"epoch": 4.48062015503876,
"grad_norm": 0.14166420698165894,
"kl": 0.16064453125,
"learning_rate": 2.322666965285697e-06,
"loss": 0.0425,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 761.8036117553711,
"epoch": 4.496124031007752,
"grad_norm": 0.33948689699172974,
"kl": 0.207275390625,
"learning_rate": 2.3083669081355507e-06,
"loss": 0.1666,
"reward": 0.07321429066359997,
"reward_std": 0.027779196621850133,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7321428805589676,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 745.7678985595703,
"epoch": 4.511627906976744,
"grad_norm": 0.9462884068489075,
"kl": 1.53857421875,
"learning_rate": 2.2940853810254377e-06,
"loss": 0.1113,
"reward": 0.06964286044239998,
"reward_std": 0.022728433134034276,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6964285969734192,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 822.5893249511719,
"epoch": 4.5271317829457365,
"grad_norm": 0.6085572838783264,
"kl": 0.3189697265625,
"learning_rate": 2.2798229831796313e-06,
"loss": 0.2599,
"reward": 0.07321429066359997,
"reward_std": 0.037880722898989916,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7321428954601288,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 993.9286041259766,
"epoch": 4.542635658914729,
"grad_norm": 0.5242969393730164,
"kl": 0.3759765625,
"learning_rate": 2.2655803130197816e-06,
"loss": 0.1781,
"reward": 0.057142860256135464,
"reward_std": 0.025253815110772848,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5714286118745804,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 886.1607513427734,
"epoch": 4.558139534883721,
"grad_norm": 0.3978789150714874,
"kl": 0.3583984375,
"learning_rate": 2.2513579681398034e-06,
"loss": 0.188,
"reward": 0.06964286044239998,
"reward_std": 0.027779195923358202,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6964286118745804,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 694.1607513427734,
"epoch": 4.573643410852713,
"grad_norm": 1.5433213710784912,
"kl": 0.54296875,
"learning_rate": 2.237156545280803e-06,
"loss": 0.1312,
"reward": 0.07857143133878708,
"reward_std": 0.020203052321448922,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.785714328289032,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 935.6965026855469,
"epoch": 4.589147286821706,
"grad_norm": 0.7737842798233032,
"kl": 0.349609375,
"learning_rate": 2.2229766403060403e-06,
"loss": 0.1306,
"reward": 0.06607143394649029,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.660714328289032,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 854.2143249511719,
"epoch": 4.604651162790698,
"grad_norm": 0.2729988992214203,
"kl": 0.80322265625,
"learning_rate": 2.2088188481759305e-06,
"loss": 0.1527,
"reward": 0.06428571976721287,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6428571790456772,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 559.0178833007812,
"epoch": 4.62015503875969,
"grad_norm": 0.42360585927963257,
"kl": 0.226318359375,
"learning_rate": 2.194683762923073e-06,
"loss": 0.1256,
"reward": 0.08214286342263222,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 557.4107437133789,
"epoch": 4.635658914728682,
"grad_norm": 1.7277472019195557,
"kl": 1.413330078125,
"learning_rate": 2.1805719776273387e-06,
"loss": 0.1408,
"reward": 0.08035714738070965,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714626312256,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 587.2857513427734,
"epoch": 4.651162790697675,
"grad_norm": 0.5576246976852417,
"kl": 0.68017578125,
"learning_rate": 2.166484084390974e-06,
"loss": 0.1176,
"reward": 0.0803571492433548,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 607.6607666015625,
"epoch": 4.666666666666667,
"grad_norm": 0.4789409339427948,
"kl": 0.2098388671875,
"learning_rate": 2.1524206743137636e-06,
"loss": 0.1325,
"reward": 0.08392857946455479,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 360.62500762939453,
"epoch": 4.682170542635659,
"grad_norm": 0.17774701118469238,
"kl": 0.1337890625,
"learning_rate": 2.1383823374682287e-06,
"loss": 0.0556,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 371.44644927978516,
"epoch": 4.6976744186046515,
"grad_norm": 29.56537437438965,
"kl": 11.70361328125,
"learning_rate": 2.124369662874868e-06,
"loss": 0.1786,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 487.1785888671875,
"epoch": 4.713178294573644,
"grad_norm": 141.20318603515625,
"kl": 45.4642333984375,
"learning_rate": 2.110383238477441e-06,
"loss": 0.524,
"reward": 0.08571428991854191,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 510.6428756713867,
"epoch": 4.728682170542635,
"grad_norm": 0.6217575669288635,
"kl": 1.52783203125,
"learning_rate": 2.096423651118305e-06,
"loss": 0.1348,
"reward": 0.0857142936438322,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 591.4107360839844,
"epoch": 4.7441860465116275,
"grad_norm": 0.8732028007507324,
"kl": 1.2354736328125,
"learning_rate": 2.082491486513788e-06,
"loss": 0.1306,
"reward": 0.07678571902215481,
"reward_std": 0.022728433134034276,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571790456772,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 469.5357437133789,
"epoch": 4.75968992248062,
"grad_norm": 0.8003130555152893,
"kl": 1.0810546875,
"learning_rate": 2.0685873292296116e-06,
"loss": 0.1155,
"reward": 0.08750000409781933,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 462.5000305175781,
"epoch": 4.775193798449612,
"grad_norm": 4.2348222732543945,
"kl": 3.279296875,
"learning_rate": 2.054711762656369e-06,
"loss": 0.0886,
"reward": 0.08571428991854191,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 433.16072845458984,
"epoch": 4.790697674418604,
"grad_norm": 0.15781785547733307,
"kl": 0.1988525390625,
"learning_rate": 2.040865368985044e-06,
"loss": 0.0994,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 313.6250114440918,
"epoch": 4.8062015503875966,
"grad_norm": 0.2198803722858429,
"kl": 0.2431640625,
"learning_rate": 2.027048729182583e-06,
"loss": 0.0431,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 382.94644927978516,
"epoch": 4.821705426356589,
"grad_norm": 0.25655123591423035,
"kl": 0.217529296875,
"learning_rate": 2.0132624229675205e-06,
"loss": 0.0808,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 344.1071548461914,
"epoch": 4.837209302325581,
"grad_norm": 0.1982196867465973,
"kl": 0.136962890625,
"learning_rate": 1.9995070287856546e-06,
"loss": 0.0639,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 407.67858123779297,
"epoch": 4.852713178294573,
"grad_norm": 0.3460334241390228,
"kl": 0.26708984375,
"learning_rate": 1.985783123785774e-06,
"loss": 0.0776,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 366.7143020629883,
"epoch": 4.868217054263566,
"grad_norm": 0.31413528323173523,
"kl": 0.273681640625,
"learning_rate": 1.9720912837954486e-06,
"loss": 0.0568,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 355.3750228881836,
"epoch": 4.883720930232558,
"grad_norm": 1.5121686458587646,
"kl": 0.6146240234375,
"learning_rate": 1.958432083296862e-06,
"loss": 0.0288,
"reward": 0.09285715036094189,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 442.3393096923828,
"epoch": 4.89922480620155,
"grad_norm": 0.1976187527179718,
"kl": 0.233154296875,
"learning_rate": 1.9448060954027093e-06,
"loss": 0.1026,
"reward": 0.08750000596046448,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 450.5357437133789,
"epoch": 4.9147286821705425,
"grad_norm": 0.8077256083488464,
"kl": 0.3204345703125,
"learning_rate": 1.931213891832153e-06,
"loss": 0.1048,
"reward": 0.0892857201397419,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 416.73216247558594,
"epoch": 4.930232558139535,
"grad_norm": 0.5839415192604065,
"kl": 0.266357421875,
"learning_rate": 1.9176560428868336e-06,
"loss": 0.0852,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 423.80358505249023,
"epoch": 4.945736434108527,
"grad_norm": 0.23957200348377228,
"kl": 0.19775390625,
"learning_rate": 1.9041331174269373e-06,
"loss": 0.0808,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 543.5000228881836,
"epoch": 4.961240310077519,
"grad_norm": 0.5168282985687256,
"kl": 0.46875,
"learning_rate": 1.8906456828473341e-06,
"loss": 0.1104,
"reward": 0.08571429178118706,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 430.4285888671875,
"epoch": 4.976744186046512,
"grad_norm": 0.32360389828681946,
"kl": 0.426513671875,
"learning_rate": 1.8771943050537656e-06,
"loss": 0.0953,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 309.8125,
"epoch": 4.992248062015504,
"grad_norm": 0.24179396033287048,
"kl": 0.2115478515625,
"learning_rate": 1.8637795484391046e-06,
"loss": 0.04,
"reward": 0.09285715036094189,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714328289032,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 586.9464569091797,
"epoch": 5.015503875968992,
"grad_norm": 0.818027675151825,
"kl": 0.38525390625,
"learning_rate": 1.8504019758596698e-06,
"loss": 0.1226,
"reward": 0.08392857760190964,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857313156128,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 378.0357246398926,
"epoch": 5.0310077519379846,
"grad_norm": 0.2399987429380417,
"kl": 0.15380859375,
"learning_rate": 1.8370621486116163e-06,
"loss": 0.0592,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 547.6607513427734,
"epoch": 5.046511627906977,
"grad_norm": 0.6719325184822083,
"kl": 0.46630859375,
"learning_rate": 1.823760626407377e-06,
"loss": 0.1547,
"reward": 0.08571429178118706,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 534.8928833007812,
"epoch": 5.062015503875969,
"grad_norm": 1.7380086183547974,
"kl": 0.52978515625,
"learning_rate": 1.8104979673521838e-06,
"loss": 0.1056,
"reward": 0.08571429178118706,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 476.76788330078125,
"epoch": 5.077519379844961,
"grad_norm": 0.3788207173347473,
"kl": 0.2706298828125,
"learning_rate": 1.7972747279206482e-06,
"loss": 0.0984,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 497.48216247558594,
"epoch": 5.093023255813954,
"grad_norm": 0.45556584000587463,
"kl": 1.263916015625,
"learning_rate": 1.7840914629334122e-06,
"loss": 0.1099,
"reward": 0.08571428991854191,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 383.6607360839844,
"epoch": 5.108527131782946,
"grad_norm": 0.08392675966024399,
"kl": 0.09619140625,
"learning_rate": 1.7709487255338731e-06,
"loss": 0.0387,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 438.5357360839844,
"epoch": 5.124031007751938,
"grad_norm": 0.34529322385787964,
"kl": 0.2061767578125,
"learning_rate": 1.7578470671649684e-06,
"loss": 0.0743,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 467.55359649658203,
"epoch": 5.1395348837209305,
"grad_norm": 0.25744950771331787,
"kl": 0.1669921875,
"learning_rate": 1.744787037546045e-06,
"loss": 0.0932,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 478.5893096923828,
"epoch": 5.155038759689923,
"grad_norm": 0.19755351543426514,
"kl": 0.215087890625,
"learning_rate": 1.731769184649788e-06,
"loss": 0.0735,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 520.6428833007812,
"epoch": 5.170542635658915,
"grad_norm": 0.23117099702358246,
"kl": 0.2330322265625,
"learning_rate": 1.7187940546792325e-06,
"loss": 0.1332,
"reward": 0.08750000596046448,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 416.58929443359375,
"epoch": 5.186046511627907,
"grad_norm": 0.45335835218429565,
"kl": 0.361083984375,
"learning_rate": 1.7058621920448465e-06,
"loss": 0.0598,
"reward": 0.09107143431901932,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 446.3035888671875,
"epoch": 5.2015503875969,
"grad_norm": 0.2482314556837082,
"kl": 0.2408447265625,
"learning_rate": 1.6929741393416855e-06,
"loss": 0.0759,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 463.66073989868164,
"epoch": 5.217054263565892,
"grad_norm": 0.6526241302490234,
"kl": 0.2216796875,
"learning_rate": 1.6801304373266286e-06,
"loss": 0.0948,
"reward": 0.09107143618166447,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 579.6607437133789,
"epoch": 5.232558139534884,
"grad_norm": 0.7119612693786621,
"kl": 0.199951171875,
"learning_rate": 1.667331624895689e-06,
"loss": 0.1398,
"reward": 0.08571429178118706,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 421.7857437133789,
"epoch": 5.248062015503876,
"grad_norm": 0.12730364501476288,
"kl": 0.1505126953125,
"learning_rate": 1.6545782390614037e-06,
"loss": 0.0542,
"reward": 0.09464286640286446,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 558.1428756713867,
"epoch": 5.263565891472869,
"grad_norm": 0.6476343274116516,
"kl": 1.1666259765625,
"learning_rate": 1.6418708149302992e-06,
"loss": 0.1135,
"reward": 0.08392857760190964,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 563.1250305175781,
"epoch": 5.27906976744186,
"grad_norm": 0.5077939033508301,
"kl": 1.557861328125,
"learning_rate": 1.6292098856804423e-06,
"loss": 0.1212,
"reward": 0.08392857760190964,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 542.3036041259766,
"epoch": 5.294573643410852,
"grad_norm": 0.2610304057598114,
"kl": 0.2861328125,
"learning_rate": 1.6165959825390661e-06,
"loss": 0.0738,
"reward": 0.08571429178118706,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 726.9286041259766,
"epoch": 5.310077519379845,
"grad_norm": 0.8110432624816895,
"kl": 0.67041015625,
"learning_rate": 1.604029634760284e-06,
"loss": 0.2361,
"reward": 0.07500000298023224,
"reward_std": 0.03030457766726613,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7500000298023224,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 639.2143249511719,
"epoch": 5.325581395348837,
"grad_norm": 0.5734603404998779,
"kl": 0.221923828125,
"learning_rate": 1.59151136960288e-06,
"loss": 0.1314,
"reward": 0.07857143133878708,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7857143133878708,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 612.1607437133789,
"epoch": 5.341085271317829,
"grad_norm": 0.37599024176597595,
"kl": 0.360595703125,
"learning_rate": 1.5790417123081903e-06,
"loss": 0.1625,
"reward": 0.08214286342263222,
"reward_std": 0.025253814877942204,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 552.0357360839844,
"epoch": 5.3565891472868215,
"grad_norm": 0.21845099329948425,
"kl": 0.2716064453125,
"learning_rate": 1.5666211860780583e-06,
"loss": 0.1555,
"reward": 0.08571429550647736,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 583.9464416503906,
"epoch": 5.372093023255814,
"grad_norm": 0.6591483950614929,
"kl": 0.310791015625,
"learning_rate": 1.5542503120528918e-06,
"loss": 0.143,
"reward": 0.08571428991854191,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 686.3214569091797,
"epoch": 5.387596899224806,
"grad_norm": 0.49289828538894653,
"kl": 0.2916259765625,
"learning_rate": 1.5419296092897866e-06,
"loss": 0.1556,
"reward": 0.07678572088479996,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571939468384,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 575.5357437133789,
"epoch": 5.403100775193798,
"grad_norm": 0.5644925832748413,
"kl": 1.2939453125,
"learning_rate": 1.529659594740755e-06,
"loss": 0.1451,
"reward": 0.08392857760190964,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857313156128,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 665.3571853637695,
"epoch": 5.4186046511627906,
"grad_norm": 0.4430839717388153,
"kl": 0.51171875,
"learning_rate": 1.5174407832310338e-06,
"loss": 0.1747,
"reward": 0.0803571492433548,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 569.732177734375,
"epoch": 5.434108527131783,
"grad_norm": 0.48753219842910767,
"kl": 0.559326171875,
"learning_rate": 1.5052736874374815e-06,
"loss": 0.1818,
"reward": 0.08214286342263222,
"reward_std": 0.025253814877942204,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 604.0714492797852,
"epoch": 5.449612403100775,
"grad_norm": 1.0126312971115112,
"kl": 0.509765625,
"learning_rate": 1.4931588178670695e-06,
"loss": 0.1626,
"reward": 0.08214286155998707,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 641.9643096923828,
"epoch": 5.465116279069767,
"grad_norm": 0.4418766498565674,
"kl": 0.4609375,
"learning_rate": 1.4810966828354605e-06,
"loss": 0.2133,
"reward": 0.08035714738070965,
"reward_std": 0.027779195923358202,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 672.5178833007812,
"epoch": 5.48062015503876,
"grad_norm": 0.30397123098373413,
"kl": 0.34521484375,
"learning_rate": 1.469087788445684e-06,
"loss": 0.1997,
"reward": 0.0803571492433548,
"reward_std": 0.02777919638901949,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 607.7678985595703,
"epoch": 5.496124031007752,
"grad_norm": 0.5019213557243347,
"kl": 0.195068359375,
"learning_rate": 1.4571326385668965e-06,
"loss": 0.1518,
"reward": 0.08392857946455479,
"reward_std": 0.022728433832526207,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 552.5000228881836,
"epoch": 5.511627906976744,
"grad_norm": 1.0830461978912354,
"kl": 2.527099609375,
"learning_rate": 1.4452317348132434e-06,
"loss": 0.1191,
"reward": 0.08214286155998707,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 604.357177734375,
"epoch": 5.5271317829457365,
"grad_norm": 1.086542010307312,
"kl": 0.6982421875,
"learning_rate": 1.4333855765228104e-06,
"loss": 0.1066,
"reward": 0.0803571492433548,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 553.1428756713867,
"epoch": 5.542635658914729,
"grad_norm": 0.7319161891937256,
"kl": 0.232177734375,
"learning_rate": 1.421594660736675e-06,
"loss": 0.1141,
"reward": 0.08571429178118706,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 666.678596496582,
"epoch": 5.558139534883721,
"grad_norm": 0.21619383990764618,
"kl": 0.128662109375,
"learning_rate": 1.4098594821780476e-06,
"loss": 0.1202,
"reward": 0.08035714738070965,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 540.1428680419922,
"epoch": 5.573643410852713,
"grad_norm": 0.1556929498910904,
"kl": 0.206787109375,
"learning_rate": 1.3981805332315174e-06,
"loss": 0.1111,
"reward": 0.08571429178118706,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 496.2857322692871,
"epoch": 5.589147286821706,
"grad_norm": 0.15184734761714935,
"kl": 0.169921875,
"learning_rate": 1.3865583039223929e-06,
"loss": 0.0747,
"reward": 0.08928572200238705,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 474.26788330078125,
"epoch": 5.604651162790698,
"grad_norm": 0.7148901224136353,
"kl": 1.36328125,
"learning_rate": 1.374993281896137e-06,
"loss": 0.0816,
"reward": 0.0892857238650322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 391.1607360839844,
"epoch": 5.62015503875969,
"grad_norm": 0.2310057431459427,
"kl": 0.188720703125,
"learning_rate": 1.3634859523979134e-06,
"loss": 0.0509,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 537.8393020629883,
"epoch": 5.635658914728682,
"grad_norm": 0.5635959506034851,
"kl": 2.5693359375,
"learning_rate": 1.3520367982522208e-06,
"loss": 0.0846,
"reward": 0.08214286342263222,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286267757416,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 483.4643096923828,
"epoch": 5.651162790697675,
"grad_norm": 0.26445066928863525,
"kl": 0.192626953125,
"learning_rate": 1.3406462998426358e-06,
"loss": 0.0337,
"reward": 0.0892857201397419,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 530.2143173217773,
"epoch": 5.666666666666667,
"grad_norm": 0.4850466847419739,
"kl": 0.1839599609375,
"learning_rate": 1.3293149350916595e-06,
"loss": 0.1534,
"reward": 0.0857142936438322,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 407.9464569091797,
"epoch": 5.682170542635659,
"grad_norm": 0.11648667603731155,
"kl": 0.1539306640625,
"learning_rate": 1.3180431794406623e-06,
"loss": 0.0207,
"reward": 0.09464286454021931,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 437.0714416503906,
"epoch": 5.6976744186046515,
"grad_norm": 0.2448117733001709,
"kl": 0.2156982421875,
"learning_rate": 1.3068315058299358e-06,
"loss": 0.0762,
"reward": 0.09285715036094189,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714477300644,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 483.6607360839844,
"epoch": 5.713178294573644,
"grad_norm": 0.6701087951660156,
"kl": 0.1866455078125,
"learning_rate": 1.2956803846788503e-06,
"loss": 0.0674,
"reward": 0.08750000782310963,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 504.6250305175781,
"epoch": 5.728682170542635,
"grad_norm": 0.8088942766189575,
"kl": 0.3497314453125,
"learning_rate": 1.284590283866116e-06,
"loss": 0.1427,
"reward": 0.08750000409781933,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 677.2321624755859,
"epoch": 5.7441860465116275,
"grad_norm": 0.19237665832042694,
"kl": 0.2906494140625,
"learning_rate": 1.2735616687101518e-06,
"loss": 0.1596,
"reward": 0.07678571715950966,
"reward_std": 0.022728433134034276,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571790456772,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 480.80359649658203,
"epoch": 5.75968992248062,
"grad_norm": 1.9329370260238647,
"kl": 1.5391845703125,
"learning_rate": 1.2625950019495614e-06,
"loss": 0.1103,
"reward": 0.08750000409781933,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 662.2143096923828,
"epoch": 5.775193798449612,
"grad_norm": 0.368585467338562,
"kl": 0.1148681640625,
"learning_rate": 1.251690743723718e-06,
"loss": 0.1846,
"reward": 0.0803571492433548,
"reward_std": 0.027779196621850133,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714626312256,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 546.0000152587891,
"epoch": 5.790697674418604,
"grad_norm": 0.2216712236404419,
"kl": 0.20556640625,
"learning_rate": 1.2408493515534581e-06,
"loss": 0.0727,
"reward": 0.08571429178118706,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 663.9643173217773,
"epoch": 5.8062015503875966,
"grad_norm": 0.8497912883758545,
"kl": 0.2208251953125,
"learning_rate": 1.2300712803218834e-06,
"loss": 0.182,
"reward": 0.07500000484287739,
"reward_std": 0.03030457766726613,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7500000447034836,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 643.928596496582,
"epoch": 5.821705426356589,
"grad_norm": 0.9936148524284363,
"kl": 1.080078125,
"learning_rate": 1.2193569822552772e-06,
"loss": 0.1875,
"reward": 0.07857143506407738,
"reward_std": 0.02525381464511156,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7857143431901932,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 744.1250610351562,
"epoch": 5.837209302325581,
"grad_norm": 0.7761398553848267,
"kl": 1.13818359375,
"learning_rate": 1.2087069069041268e-06,
"loss": 0.1931,
"reward": 0.07321429066359997,
"reward_std": 0.02777919638901949,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7321428954601288,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 729.5357360839844,
"epoch": 5.852713178294573,
"grad_norm": 0.48130375146865845,
"kl": 0.517333984375,
"learning_rate": 1.1981215011242654e-06,
"loss": 0.1987,
"reward": 0.07500000670552254,
"reward_std": 0.025253815343603492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7500000447034836,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 743.2857513427734,
"epoch": 5.868217054263566,
"grad_norm": 0.4081970155239105,
"kl": 1.74365234375,
"learning_rate": 1.1876012090581184e-06,
"loss": 0.2046,
"reward": 0.0714285746216774,
"reward_std": 0.030304577900096774,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.714285746216774,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 889.1964721679688,
"epoch": 5.883720930232558,
"grad_norm": 0.2974870502948761,
"kl": 0.357666015625,
"learning_rate": 1.177146472116071e-06,
"loss": 0.1796,
"reward": 0.06607143208384514,
"reward_std": 0.022728433832526207,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6607143133878708,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 800.3214721679688,
"epoch": 5.89922480620155,
"grad_norm": 1.8590128421783447,
"kl": 2.197265625,
"learning_rate": 1.1667577289579462e-06,
"loss": 0.2563,
"reward": 0.06785714626312256,
"reward_std": 0.0353553406894207,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6785714775323868,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 828.8214721679688,
"epoch": 5.9147286821705425,
"grad_norm": 1.9380897283554077,
"kl": 1.0203857421875,
"learning_rate": 1.1564354154746007e-06,
"loss": 0.183,
"reward": 0.06964286044239998,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6964285969734192,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 502.98217391967773,
"epoch": 5.930232558139535,
"grad_norm": 0.8601269125938416,
"kl": 2.1402587890625,
"learning_rate": 1.146179964769635e-06,
"loss": 0.0414,
"reward": 0.08392857760190964,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.839285746216774,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 557.3393096923828,
"epoch": 5.945736434108527,
"grad_norm": 0.22515276074409485,
"kl": 0.217041015625,
"learning_rate": 1.1359918071412195e-06,
"loss": 0.097,
"reward": 0.08392857573926449,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 464.10717010498047,
"epoch": 5.961240310077519,
"grad_norm": 1.7024885416030884,
"kl": 1.64404296875,
"learning_rate": 1.1258713700640456e-06,
"loss": 0.0753,
"reward": 0.08750000596046448,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 425.8035888671875,
"epoch": 5.976744186046512,
"grad_norm": 0.17091427743434906,
"kl": 0.12548828125,
"learning_rate": 1.115819078171383e-06,
"loss": 0.0727,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 580.96875,
"epoch": 5.992248062015504,
"grad_norm": 0.2202906310558319,
"kl": 0.183349609375,
"learning_rate": 1.1058353532372667e-06,
"loss": 0.1451,
"reward": 0.08571429178118706,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 501.57144927978516,
"epoch": 6.015503875968992,
"grad_norm": 0.42733341455459595,
"kl": 0.106689453125,
"learning_rate": 1.0959206141587998e-06,
"loss": 0.1143,
"reward": 0.08928571827709675,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 477.94644927978516,
"epoch": 6.0310077519379846,
"grad_norm": 0.15901905298233032,
"kl": 0.1177978515625,
"learning_rate": 1.0860752769385766e-06,
"loss": 0.0928,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 368.4285888671875,
"epoch": 6.046511627906977,
"grad_norm": 0.8356048464775085,
"kl": 0.5009765625,
"learning_rate": 1.0762997546672279e-06,
"loss": 0.0306,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 537.2321701049805,
"epoch": 6.062015503875969,
"grad_norm": 0.337862104177475,
"kl": 0.1614990234375,
"learning_rate": 1.0665944575060914e-06,
"loss": 0.1243,
"reward": 0.08750000782310963,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 498.9821548461914,
"epoch": 6.077519379844961,
"grad_norm": 1.3160614967346191,
"kl": 1.275390625,
"learning_rate": 1.056959792669997e-06,
"loss": 0.1232,
"reward": 0.08392857760190964,
"reward_std": 0.022728433134034276,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 455.35716247558594,
"epoch": 6.093023255813954,
"grad_norm": 0.6817833185195923,
"kl": 0.56591796875,
"learning_rate": 1.0473961644101856e-06,
"loss": 0.0979,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 460.05359649658203,
"epoch": 6.108527131782946,
"grad_norm": 0.9967363476753235,
"kl": 0.3055419921875,
"learning_rate": 1.037903973997345e-06,
"loss": 0.0846,
"reward": 0.08750000782310963,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 567.803596496582,
"epoch": 6.124031007751938,
"grad_norm": 0.22702349722385406,
"kl": 0.1322021484375,
"learning_rate": 1.0284836197047737e-06,
"loss": 0.1403,
"reward": 0.0857142936438322,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 586.8393020629883,
"epoch": 6.1395348837209305,
"grad_norm": 0.20280596613883972,
"kl": 0.182373046875,
"learning_rate": 1.0191354967916712e-06,
"loss": 0.0784,
"reward": 0.08214286528527737,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286267757416,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 549.9107360839844,
"epoch": 6.155038759689923,
"grad_norm": 0.27999112010002136,
"kl": 0.1822509765625,
"learning_rate": 1.0098599974865515e-06,
"loss": 0.1597,
"reward": 0.0857142936438322,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 518.0893020629883,
"epoch": 6.170542635658915,
"grad_norm": 0.2629588842391968,
"kl": 0.184814453125,
"learning_rate": 1.0006575109707898e-06,
"loss": 0.0787,
"reward": 0.0892857201397419,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 547.4107360839844,
"epoch": 6.186046511627907,
"grad_norm": 0.5427741408348083,
"kl": 0.40625,
"learning_rate": 9.915284233622877e-07,
"loss": 0.1505,
"reward": 0.08392857760190964,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 537.7500228881836,
"epoch": 6.2015503875969,
"grad_norm": 0.4367233216762543,
"kl": 0.438232421875,
"learning_rate": 9.824731176992796e-07,
"loss": 0.1517,
"reward": 0.08571429178118706,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 617.9821701049805,
"epoch": 6.217054263565892,
"grad_norm": 0.336049884557724,
"kl": 0.306640625,
"learning_rate": 9.734919739242543e-07,
"loss": 0.1765,
"reward": 0.0803571492433548,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 599.1964569091797,
"epoch": 6.232558139534884,
"grad_norm": 0.5920295119285583,
"kl": 0.416259765625,
"learning_rate": 9.645853688680177e-07,
"loss": 0.1458,
"reward": 0.08214286528527737,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286267757416,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 540.6428833007812,
"epoch": 6.248062015503876,
"grad_norm": 0.3487648665904999,
"kl": 0.394775390625,
"learning_rate": 9.557536762338786e-07,
"loss": 0.112,
"reward": 0.08750000596046448,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 528.6071701049805,
"epoch": 6.263565891472869,
"grad_norm": 0.507084846496582,
"kl": 0.4010009765625,
"learning_rate": 9.46997266581973e-07,
"loss": 0.1125,
"reward": 0.08571429178118706,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 561.0000305175781,
"epoch": 6.27906976744186,
"grad_norm": 0.4561972916126251,
"kl": 0.3779296875,
"learning_rate": 9.383165073137115e-07,
"loss": 0.0993,
"reward": 0.08392857946455479,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 429.57144927978516,
"epoch": 6.294573643410852,
"grad_norm": 0.48230594396591187,
"kl": 0.860107421875,
"learning_rate": 9.297117626563687e-07,
"loss": 0.0911,
"reward": 0.08928572200238705,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 590.428581237793,
"epoch": 6.310077519379845,
"grad_norm": 0.8593739867210388,
"kl": 2.3194580078125,
"learning_rate": 9.211833936477957e-07,
"loss": 0.1887,
"reward": 0.07857143506407738,
"reward_std": 0.03030457836575806,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.785714328289032,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 448.3393096923828,
"epoch": 6.325581395348837,
"grad_norm": 0.32057252526283264,
"kl": 0.25048828125,
"learning_rate": 9.127317581212753e-07,
"loss": 0.0977,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 484.67859268188477,
"epoch": 6.341085271317829,
"grad_norm": 0.534175455570221,
"kl": 0.37890625,
"learning_rate": 9.043572106905084e-07,
"loss": 0.1328,
"reward": 0.08750000782310963,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 470.5535888671875,
"epoch": 6.3565891472868215,
"grad_norm": 0.475492924451828,
"kl": 1.1109619140625,
"learning_rate": 8.960601027347321e-07,
"loss": 0.1104,
"reward": 0.08750000409781933,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 570.3214416503906,
"epoch": 6.372093023255814,
"grad_norm": 0.6804907917976379,
"kl": 0.504150390625,
"learning_rate": 8.878407823839788e-07,
"loss": 0.1259,
"reward": 0.08392857946455479,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 520.8928833007812,
"epoch": 6.387596899224806,
"grad_norm": 0.31301963329315186,
"kl": 0.2147216796875,
"learning_rate": 8.796995945044689e-07,
"loss": 0.0639,
"reward": 0.08750000782310963,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 400.62500762939453,
"epoch": 6.403100775193798,
"grad_norm": 0.1473621129989624,
"kl": 0.173095703125,
"learning_rate": 8.716368806841405e-07,
"loss": 0.0833,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 459.0714569091797,
"epoch": 6.4186046511627906,
"grad_norm": 0.17894765734672546,
"kl": 0.1246337890625,
"learning_rate": 8.636529792183171e-07,
"loss": 0.074,
"reward": 0.0892857201397419,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 485.55359649658203,
"epoch": 6.434108527131783,
"grad_norm": 0.17360633611679077,
"kl": 0.1624755859375,
"learning_rate": 8.557482250955144e-07,
"loss": 0.1076,
"reward": 0.0892857201397419,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 536.5893173217773,
"epoch": 6.449612403100775,
"grad_norm": 1.2795969247817993,
"kl": 0.708984375,
"learning_rate": 8.479229499833844e-07,
"loss": 0.1615,
"reward": 0.0857142936438322,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 508.01788330078125,
"epoch": 6.465116279069767,
"grad_norm": 0.28875473141670227,
"kl": 0.2587890625,
"learning_rate": 8.401774822147976e-07,
"loss": 0.1362,
"reward": 0.08750000782310963,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 560.0178909301758,
"epoch": 6.48062015503876,
"grad_norm": 0.5606586337089539,
"kl": 0.267333984375,
"learning_rate": 8.325121467740695e-07,
"loss": 0.1107,
"reward": 0.08571428991854191,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 530.4643096923828,
"epoch": 6.496124031007752,
"grad_norm": 0.5019002556800842,
"kl": 1.6973876953125,
"learning_rate": 8.249272652833226e-07,
"loss": 0.0951,
"reward": 0.08571429550647736,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571429252624512,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 549.6250228881836,
"epoch": 6.511627906976744,
"grad_norm": 0.23863928020000458,
"kl": 0.21630859375,
"learning_rate": 8.174231559889931e-07,
"loss": 0.1364,
"reward": 0.08392857387661934,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.839285746216774,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 572.4285888671875,
"epoch": 6.5271317829457365,
"grad_norm": 0.3975156247615814,
"kl": 1.265869140625,
"learning_rate": 8.100001337484787e-07,
"loss": 0.1362,
"reward": 0.08214285969734192,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 455.23216247558594,
"epoch": 6.542635658914729,
"grad_norm": 0.2875515818595886,
"kl": 0.726806640625,
"learning_rate": 8.026585100169251e-07,
"loss": 0.0998,
"reward": 0.09107143431901932,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 564.5714492797852,
"epoch": 6.558139534883721,
"grad_norm": 0.4496704339981079,
"kl": 0.2958984375,
"learning_rate": 7.953985928341601e-07,
"loss": 0.1502,
"reward": 0.0857142936438322,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 558.9643096923828,
"epoch": 6.573643410852713,
"grad_norm": 0.5100883841514587,
"kl": 1.135009765625,
"learning_rate": 7.882206868117693e-07,
"loss": 0.0882,
"reward": 0.08214286342263222,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 636.2143096923828,
"epoch": 6.589147286821706,
"grad_norm": 0.7258642315864563,
"kl": 1.04638671875,
"learning_rate": 7.81125093120313e-07,
"loss": 0.1719,
"reward": 0.07857143506407738,
"reward_std": 0.025253815343603492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7857143431901932,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 622.4464492797852,
"epoch": 6.604651162790698,
"grad_norm": 0.47343626618385315,
"kl": 0.2442626953125,
"learning_rate": 7.741121094766916e-07,
"loss": 0.166,
"reward": 0.08035714738070965,
"reward_std": 0.022728433134034276,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714775323868,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 628.3928833007812,
"epoch": 6.62015503875969,
"grad_norm": 0.1919441670179367,
"kl": 0.190673828125,
"learning_rate": 7.671820301316532e-07,
"loss": 0.1353,
"reward": 0.08035714738070965,
"reward_std": 0.01767767034471035,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714626312256,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 513.9107246398926,
"epoch": 6.635658914728682,
"grad_norm": 0.5142886638641357,
"kl": 1.94091796875,
"learning_rate": 7.603351458574474e-07,
"loss": 0.1437,
"reward": 0.08035714738070965,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714477300644,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 474.89288330078125,
"epoch": 6.651162790697675,
"grad_norm": 0.16202746331691742,
"kl": 0.130859375,
"learning_rate": 7.535717439356255e-07,
"loss": 0.0393,
"reward": 0.0892857201397419,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 627.6785888671875,
"epoch": 6.666666666666667,
"grad_norm": 0.4151498079299927,
"kl": 0.4073486328125,
"learning_rate": 7.46892108144986e-07,
"loss": 0.1021,
"reward": 0.08035714738070965,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8035714626312256,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 461.4107322692871,
"epoch": 6.682170542635659,
"grad_norm": 0.9489464163780212,
"kl": 2.072998046875,
"learning_rate": 7.402965187496697e-07,
"loss": 0.0601,
"reward": 0.08750000968575478,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000596046448,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 580.928596496582,
"epoch": 6.6976744186046515,
"grad_norm": 0.6491573452949524,
"kl": 0.584716796875,
"learning_rate": 7.337852524873974e-07,
"loss": 0.0931,
"reward": 0.08392857760190964,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.839285746216774,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 533.8035888671875,
"epoch": 6.713178294573644,
"grad_norm": 0.38969844579696655,
"kl": 0.188232421875,
"learning_rate": 7.273585825578608e-07,
"loss": 0.1114,
"reward": 0.0857142936438322,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 692.3036041259766,
"epoch": 6.728682170542635,
"grad_norm": 0.8381134867668152,
"kl": 0.2666015625,
"learning_rate": 7.21016778611259e-07,
"loss": 0.2135,
"reward": 0.07678571902215481,
"reward_std": 0.027779196621850133,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.767857164144516,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 510.71431732177734,
"epoch": 6.7441860465116275,
"grad_norm": 0.562402069568634,
"kl": 0.271484375,
"learning_rate": 7.147601067369835e-07,
"loss": 0.0545,
"reward": 0.08571429178118706,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 550.9821624755859,
"epoch": 6.75968992248062,
"grad_norm": 0.7675048112869263,
"kl": 0.66845703125,
"learning_rate": 7.085888294524561e-07,
"loss": 0.122,
"reward": 0.0857142936438322,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 569.0000305175781,
"epoch": 6.775193798449612,
"grad_norm": 0.7039144039154053,
"kl": 0.5224609375,
"learning_rate": 7.025032056921117e-07,
"loss": 0.1336,
"reward": 0.08392857573926449,
"reward_std": 0.022728432901203632,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.839285746216774,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 644.6428833007812,
"epoch": 6.790697674418604,
"grad_norm": 0.642671525478363,
"kl": 0.45849609375,
"learning_rate": 6.965034907965349e-07,
"loss": 0.2222,
"reward": 0.07678571902215481,
"reward_std": 0.03282995941117406,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571939468384,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 651.3928756713867,
"epoch": 6.8062015503875966,
"grad_norm": 0.9575018286705017,
"kl": 1.3515625,
"learning_rate": 6.905899365017462e-07,
"loss": 0.1341,
"reward": 0.07678571902215481,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571790456772,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 517.9285888671875,
"epoch": 6.821705426356589,
"grad_norm": 1.0209612846374512,
"kl": 0.86376953125,
"learning_rate": 6.847627909286409e-07,
"loss": 0.1229,
"reward": 0.0857142936438322,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 689.7500457763672,
"epoch": 6.837209302325581,
"grad_norm": 0.41212135553359985,
"kl": 0.3935546875,
"learning_rate": 6.790222985725761e-07,
"loss": 0.2152,
"reward": 0.07678571902215481,
"reward_std": 0.02777919638901949,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.7678571790456772,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 658.1607437133789,
"epoch": 6.852713178294573,
"grad_norm": 0.4613673686981201,
"kl": 0.466552734375,
"learning_rate": 6.733687002931141e-07,
"loss": 0.1572,
"reward": 0.07857143320143223,
"reward_std": 0.02020305162295699,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.785714328289032,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 762.0893096923828,
"epoch": 6.868217054263566,
"grad_norm": 0.6260417103767395,
"kl": 0.501708984375,
"learning_rate": 6.678022333039158e-07,
"loss": 0.2349,
"reward": 0.0714285746216774,
"reward_std": 0.0353553406894207,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.714285746216774,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 552.803596496582,
"epoch": 6.883720930232558,
"grad_norm": 0.4210367798805237,
"kl": 0.265625,
"learning_rate": 6.623231311627876e-07,
"loss": 0.1448,
"reward": 0.08392857573926449,
"reward_std": 0.017677670111879706,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 562.7500228881836,
"epoch": 6.89922480620155,
"grad_norm": 0.2927769124507904,
"kl": 0.3931884765625,
"learning_rate": 6.569316237618811e-07,
"loss": 0.1344,
"reward": 0.08392857760190964,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 424.03573989868164,
"epoch": 6.9147286821705425,
"grad_norm": 0.2194661796092987,
"kl": 0.30908203125,
"learning_rate": 6.516279373180499e-07,
"loss": 0.0665,
"reward": 0.09107143618166447,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107142984867096,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 527.428596496582,
"epoch": 6.930232558139535,
"grad_norm": 0.3145492672920227,
"kl": 0.4027099609375,
"learning_rate": 6.464122943633543e-07,
"loss": 0.1228,
"reward": 0.0857142936438322,
"reward_std": 0.02020305208861828,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 350.0178756713867,
"epoch": 6.945736434108527,
"grad_norm": 0.3451617658138275,
"kl": 0.1922607421875,
"learning_rate": 6.412849137357271e-07,
"loss": 0.0394,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 512.9821701049805,
"epoch": 6.961240310077519,
"grad_norm": 0.6971478462219238,
"kl": 0.8720703125,
"learning_rate": 6.3624601056979e-07,
"loss": 0.0821,
"reward": 0.08571429178118706,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 373.4464416503906,
"epoch": 6.976744186046512,
"grad_norm": 0.2429589033126831,
"kl": 0.1673583984375,
"learning_rate": 6.312957962878278e-07,
"loss": 0.0339,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 419.25,
"epoch": 6.992248062015504,
"grad_norm": 0.7322734594345093,
"kl": 0.724365234375,
"learning_rate": 6.264344785909181e-07,
"loss": 0.0543,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 322.08929443359375,
"epoch": 7.015503875968992,
"grad_norm": 0.11766334623098373,
"kl": 0.0931396484375,
"learning_rate": 6.216622614502149e-07,
"loss": 0.0416,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 468.3035888671875,
"epoch": 7.0310077519379846,
"grad_norm": 0.3541572093963623,
"kl": 0.212646484375,
"learning_rate": 6.169793450983916e-07,
"loss": 0.0952,
"reward": 0.09107143431901932,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 455.50001525878906,
"epoch": 7.046511627906977,
"grad_norm": 0.1937684863805771,
"kl": 0.177001953125,
"learning_rate": 6.123859260212393e-07,
"loss": 0.1133,
"reward": 0.08928572200238705,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 467.4107360839844,
"epoch": 7.062015503875969,
"grad_norm": 0.21939167380332947,
"kl": 0.1229248046875,
"learning_rate": 6.07882196949423e-07,
"loss": 0.0914,
"reward": 0.09107143431901932,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107142984867096,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 455.4821548461914,
"epoch": 7.077519379844961,
"grad_norm": 0.30737853050231934,
"kl": 0.302490234375,
"learning_rate": 6.034683468503948e-07,
"loss": 0.0882,
"reward": 0.08928572200238705,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 368.7321662902832,
"epoch": 7.093023255813954,
"grad_norm": 1.252065896987915,
"kl": 0.3232421875,
"learning_rate": 5.991445609204641e-07,
"loss": 0.0627,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 405.5893020629883,
"epoch": 7.108527131782946,
"grad_norm": 0.13495229184627533,
"kl": 0.1036376953125,
"learning_rate": 5.949110205770292e-07,
"loss": 0.0504,
"reward": 0.09464286267757416,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 346.8928680419922,
"epoch": 7.124031007751938,
"grad_norm": 0.15541866421699524,
"kl": 0.1348876953125,
"learning_rate": 5.90767903450964e-07,
"loss": 0.0409,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 438.0000228881836,
"epoch": 7.1395348837209305,
"grad_norm": 0.16461335122585297,
"kl": 0.1226806640625,
"learning_rate": 5.867153833791652e-07,
"loss": 0.0759,
"reward": 0.09285714849829674,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 328.66072845458984,
"epoch": 7.155038759689923,
"grad_norm": 0.08290436118841171,
"kl": 0.1099853515625,
"learning_rate": 5.827536303972587e-07,
"loss": 0.0361,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 416.75001525878906,
"epoch": 7.170542635658915,
"grad_norm": 0.4567578434944153,
"kl": 1.5322265625,
"learning_rate": 5.78882810732465e-07,
"loss": 0.0584,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 314.1607246398926,
"epoch": 7.186046511627907,
"grad_norm": 0.949168860912323,
"kl": 0.59814453125,
"learning_rate": 5.75103086796625e-07,
"loss": 0.0286,
"reward": 0.09642858058214188,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 327.0893020629883,
"epoch": 7.2015503875969,
"grad_norm": 0.5412436127662659,
"kl": 0.7969970703125,
"learning_rate": 5.714146171793846e-07,
"loss": 0.0063,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 412.1785888671875,
"epoch": 7.217054263565892,
"grad_norm": 0.24841195344924927,
"kl": 0.1400146484375,
"learning_rate": 5.678175566415422e-07,
"loss": 0.0389,
"reward": 0.09285714849829674,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 481.60717010498047,
"epoch": 7.232558139534884,
"grad_norm": 1.5795103311538696,
"kl": 0.748291015625,
"learning_rate": 5.643120561085528e-07,
"loss": 0.0451,
"reward": 0.08750000596046448,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 483.3393020629883,
"epoch": 7.248062015503876,
"grad_norm": 0.3128277361392975,
"kl": 0.57806396484375,
"learning_rate": 5.608982626641991e-07,
"loss": 0.0626,
"reward": 0.08750000782310963,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000298023224,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 388.3750228881836,
"epoch": 7.263565891472869,
"grad_norm": 0.8693004250526428,
"kl": 1.3035888671875,
"learning_rate": 5.575763195444166e-07,
"loss": 0.0332,
"reward": 0.09285715222358704,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 541.7500305175781,
"epoch": 7.27906976744186,
"grad_norm": 0.2898370027542114,
"kl": 0.269287109375,
"learning_rate": 5.543463661312847e-07,
"loss": 0.1174,
"reward": 0.0857142936438322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 366.8393020629883,
"epoch": 7.294573643410852,
"grad_norm": 0.2669702172279358,
"kl": 0.2113037109375,
"learning_rate": 5.512085379471808e-07,
"loss": 0.0394,
"reward": 0.09642857871949673,
"reward_std": 0.00505076302215457,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9642857313156128,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 462.58931732177734,
"epoch": 7.310077519379845,
"grad_norm": 1.6552106142044067,
"kl": 0.7685546875,
"learning_rate": 5.481629666490903e-07,
"loss": 0.1032,
"reward": 0.08928572200238705,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 540.9643020629883,
"epoch": 7.325581395348837,
"grad_norm": 0.23821362853050232,
"kl": 0.270263671875,
"learning_rate": 5.452097800230853e-07,
"loss": 0.1599,
"reward": 0.08571429178118706,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 437.1071548461914,
"epoch": 7.341085271317829,
"grad_norm": 0.28896719217300415,
"kl": 1.3438720703125,
"learning_rate": 5.423491019789623e-07,
"loss": 0.0397,
"reward": 0.08928572200238705,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 386.44644927978516,
"epoch": 7.3565891472868215,
"grad_norm": 0.4218304455280304,
"kl": 0.3291015625,
"learning_rate": 5.395810525450425e-07,
"loss": 0.0623,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 533.1428909301758,
"epoch": 7.372093023255814,
"grad_norm": 0.2983189821243286,
"kl": 0.1497802734375,
"learning_rate": 5.369057478631359e-07,
"loss": 0.153,
"reward": 0.0857142936438322,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 502.2678756713867,
"epoch": 7.387596899224806,
"grad_norm": 0.3067476451396942,
"kl": 0.297119140625,
"learning_rate": 5.343233001836694e-07,
"loss": 0.0749,
"reward": 0.0892857201397419,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 615.8928756713867,
"epoch": 7.403100775193798,
"grad_norm": 0.3363507390022278,
"kl": 0.287353515625,
"learning_rate": 5.318338178609754e-07,
"loss": 0.1077,
"reward": 0.08214286342263222,
"reward_std": 0.015152288833633065,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8214286118745804,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 595.1785888671875,
"epoch": 7.4186046511627906,
"grad_norm": 0.17296263575553894,
"kl": 0.2437744140625,
"learning_rate": 5.294374053487459e-07,
"loss": 0.0975,
"reward": 0.08392857760190964,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857313156128,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 464.4107360839844,
"epoch": 7.434108527131783,
"grad_norm": 1.2098743915557861,
"kl": 0.6441650390625,
"learning_rate": 5.271341631956511e-07,
"loss": 0.0544,
"reward": 0.08928572200238705,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.892857164144516,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 355.25001525878906,
"epoch": 7.449612403100775,
"grad_norm": 0.0816921591758728,
"kl": 0.100341796875,
"learning_rate": 5.249241880411181e-07,
"loss": 0.0204,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 451.66072845458984,
"epoch": 7.465116279069767,
"grad_norm": 0.1864636242389679,
"kl": 0.10595703125,
"learning_rate": 5.228075726112785e-07,
"loss": 0.0685,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 518.3571624755859,
"epoch": 7.48062015503876,
"grad_norm": 0.27271780371665955,
"kl": 0.115478515625,
"learning_rate": 5.207844057150768e-07,
"loss": 0.0914,
"reward": 0.08750000782310963,
"reward_std": 0.01262690732255578,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000447034836,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 502.76788330078125,
"epoch": 7.496124031007752,
"grad_norm": 0.25202253460884094,
"kl": 0.2041015625,
"learning_rate": 5.188547722405437e-07,
"loss": 0.1187,
"reward": 0.0892857238650322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 470.0893020629883,
"epoch": 7.511627906976744,
"grad_norm": 0.33710911870002747,
"kl": 1.2530517578125,
"learning_rate": 5.170187531512351e-07,
"loss": 0.0767,
"reward": 0.0892857238650322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 499.73216247558594,
"epoch": 7.5271317829457365,
"grad_norm": 0.12209226191043854,
"kl": 0.141845703125,
"learning_rate": 5.152764254828348e-07,
"loss": 0.0716,
"reward": 0.08928572200238705,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 456.92859649658203,
"epoch": 7.542635658914729,
"grad_norm": 0.266292005777359,
"kl": 0.1436767578125,
"learning_rate": 5.136278623399225e-07,
"loss": 0.0962,
"reward": 0.09107143618166447,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 333.2678756713867,
"epoch": 7.558139534883721,
"grad_norm": 0.7162752747535706,
"kl": 0.4906005859375,
"learning_rate": 5.120731328929058e-07,
"loss": 0.0237,
"reward": 0.09821429289877415,
"reward_std": 0.002525381511077285,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9821428656578064,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 456.1250228881836,
"epoch": 7.573643410852713,
"grad_norm": 0.4401909410953522,
"kl": 1.7235107421875,
"learning_rate": 5.106123023751187e-07,
"loss": 0.0754,
"reward": 0.0892857238650322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 482.6250228881836,
"epoch": 7.589147286821706,
"grad_norm": 0.48811131715774536,
"kl": 0.47265625,
"learning_rate": 5.092454320800833e-07,
"loss": 0.1142,
"reward": 0.0892857201397419,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571939468384,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 517.1250228881836,
"epoch": 7.604651162790698,
"grad_norm": 0.28417670726776123,
"kl": 0.2467041015625,
"learning_rate": 5.079725793589405e-07,
"loss": 0.1384,
"reward": 0.08750000968575478,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000596046448,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 587.6964569091797,
"epoch": 7.62015503875969,
"grad_norm": 0.27875715494155884,
"kl": 0.3658447265625,
"learning_rate": 5.067937976180407e-07,
"loss": 0.1719,
"reward": 0.08392857946455479,
"reward_std": 0.02272843336686492,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857760190964,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 524.5893096923828,
"epoch": 7.635658914728682,
"grad_norm": 0.35742247104644775,
"kl": 0.27734375,
"learning_rate": 5.057091363167046e-07,
"loss": 0.0743,
"reward": 0.08928572200238705,
"reward_std": 0.010101525811478496,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 457.9285888671875,
"epoch": 7.651162790697675,
"grad_norm": 0.6690515875816345,
"kl": 0.390625,
"learning_rate": 5.047186409651489e-07,
"loss": 0.086,
"reward": 0.09107143804430962,
"reward_std": 0.012626907555386424,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.910714328289032,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 462.62501525878906,
"epoch": 7.666666666666667,
"grad_norm": 0.2108573466539383,
"kl": 0.175048828125,
"learning_rate": 5.038223531225742e-07,
"loss": 0.0587,
"reward": 0.09107143245637417,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9107143133878708,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 374.7321586608887,
"epoch": 7.682170542635659,
"grad_norm": 0.3733845353126526,
"kl": 0.33349609375,
"learning_rate": 5.030203103954232e-07,
"loss": 0.0632,
"reward": 0.09464286454021931,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9464285969734192,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 417.73216247558594,
"epoch": 7.6976744186046515,
"grad_norm": 0.43952879309654236,
"kl": 1.1201171875,
"learning_rate": 5.023125464358026e-07,
"loss": 0.0543,
"reward": 0.09285715036094189,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9285714626312256,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 549.8393020629883,
"epoch": 7.713178294573644,
"grad_norm": 0.29617685079574585,
"kl": 0.201416015625,
"learning_rate": 5.016990909400709e-07,
"loss": 0.113,
"reward": 0.0857142936438322,
"reward_std": 0.015152289066463709,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428954601288,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 558.8393020629883,
"epoch": 7.728682170542635,
"grad_norm": 0.6212316155433655,
"kl": 0.447265625,
"learning_rate": 5.011799696475915e-07,
"loss": 0.1539,
"reward": 0.0857142936438322,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.85714291036129,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 492.17860412597656,
"epoch": 7.7441860465116275,
"grad_norm": 0.23731939494609833,
"kl": 0.216064453125,
"learning_rate": 5.007552043396547e-07,
"loss": 0.071,
"reward": 0.0892857201397419,
"reward_std": 0.01010152604430914,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8928571790456772,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 559.7321701049805,
"epoch": 7.75968992248062,
"grad_norm": 0.44135406613349915,
"kl": 0.2109375,
"learning_rate": 5.004248128385618e-07,
"loss": 0.149,
"reward": 0.08571429178118706,
"reward_std": 0.020203051855787635,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8571428805589676,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 568.7321624755859,
"epoch": 7.775193798449612,
"grad_norm": 0.259131520986557,
"kl": 0.3043212890625,
"learning_rate": 5.001888090068784e-07,
"loss": 0.1772,
"reward": 0.08392857760190964,
"reward_std": 0.022728433599695563,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 525.1607437133789,
"epoch": 7.790697674418604,
"grad_norm": 0.23898068070411682,
"kl": 0.278564453125,
"learning_rate": 5.000472027468528e-07,
"loss": 0.1237,
"reward": 0.08750000968575478,
"reward_std": 0.017677670577540994,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8750000596046448,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 524.2143173217773,
"epoch": 7.8062015503875966,
"grad_norm": 0.4151113033294678,
"kl": 1.75830078125,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0621,
"reward": 0.08392857946455479,
"reward_std": 0.0075761445332318544,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8392857611179352,
"step": 500
},
{
"epoch": 7.8062015503875966,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.059448377258595884,
"train_runtime": 22252.9361,
"train_samples_per_second": 1.258,
"train_steps_per_second": 0.022
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}