Gullein / checkpoint-400 /trainer_state.json
Hasnonname's picture
Add files using upload-large-folder tool
7f1238e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1000.0000610351562,
"epoch": 0.0025,
"grad_norm": 0.2079561913831216,
"kl": 0.0158782958984375,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.0,
"num_tokens": 52920.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.0,
"rewards/slop_reward": 1.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 179.22857971191405,
"epoch": 0.005,
"grad_norm": 12.249017739002824,
"kl": 0.00041656494140625,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0,
"num_tokens": 77113.0,
"reward": 1.8515485525131226,
"reward_std": 0.6024735510349274,
"rewards/classifier_reward": 0.37967347651720046,
"rewards/length_reward": 0.7142857313156128,
"rewards/slop_reward": 0.7575892865657806,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 133.4857208251953,
"epoch": 0.0075,
"grad_norm": 10089.085994752151,
"kl": 0.001357269287109375,
"learning_rate": 1e-07,
"loss": 0.0,
"num_tokens": 99305.0,
"reward": 1.5958443641662599,
"reward_std": 0.44231254458427427,
"rewards/classifier_reward": 0.4083442732691765,
"rewards/length_reward": 0.3428571462631226,
"rewards/slop_reward": 0.8446428537368774,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 215.08572387695312,
"epoch": 0.01,
"grad_norm": 2054.9417817778553,
"kl": 0.0014644622802734374,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"num_tokens": 124753.0,
"reward": 1.2784200072288514,
"reward_std": 0.6290957629680634,
"rewards/classifier_reward": 0.1494020951911807,
"rewards/length_reward": 0.4571428656578064,
"rewards/slop_reward": 0.6718749940395355,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 173.74286346435548,
"epoch": 0.0125,
"grad_norm": 2266.5233619504957,
"kl": 0.001641082763671875,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0,
"num_tokens": 148708.0,
"reward": 1.3791944861412049,
"reward_std": 0.43466432094573976,
"rewards/classifier_reward": 0.17383726984262465,
"rewards/length_reward": 0.5428571462631225,
"rewards/slop_reward": 0.6625,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 202.02857971191406,
"epoch": 0.015,
"grad_norm": 11.643315762932316,
"kl": 0.0005245208740234375,
"learning_rate": 2e-07,
"loss": 0.0,
"num_tokens": 173615.0,
"reward": 2.2288811445236205,
"reward_std": 0.4513823240995407,
"rewards/classifier_reward": 0.4342381663620472,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 0.9089285731315613,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 213.74286499023438,
"epoch": 0.0175,
"grad_norm": 13.650814377669256,
"kl": 0.000527191162109375,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.0,
"num_tokens": 199016.0,
"reward": 1.9626029968261718,
"reward_std": 0.3459654450416565,
"rewards/classifier_reward": 0.39831727296113967,
"rewards/length_reward": 0.7142857164144516,
"rewards/slop_reward": 0.8499999940395355,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 288.71429443359375,
"epoch": 0.02,
"grad_norm": 10.936857808142296,
"kl": 0.0004474639892578125,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0,
"num_tokens": 227041.0,
"reward": 1.8008938789367677,
"reward_std": 0.3342688336968422,
"rewards/classifier_reward": 0.4794651668518782,
"rewards/length_reward": 0.4285714328289032,
"rewards/slop_reward": 0.8928571343421936,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 135.14286193847656,
"epoch": 0.0225,
"grad_norm": 13.869838227975858,
"kl": 0.000525665283203125,
"learning_rate": 3e-07,
"loss": 0.0005,
"num_tokens": 249691.0,
"reward": 1.9354748964309691,
"reward_std": 0.3361890375614166,
"rewards/classifier_reward": 0.5604747980833054,
"rewards/length_reward": 0.5142857193946838,
"rewards/slop_reward": 0.8607142925262451,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 208.91429748535157,
"epoch": 0.025,
"grad_norm": 12.652344276830467,
"kl": 0.0005481719970703125,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"num_tokens": 274843.0,
"reward": 1.943293523788452,
"reward_std": 0.6581979870796204,
"rewards/classifier_reward": 0.5843648463487625,
"rewards/length_reward": 0.6000000178813935,
"rewards/slop_reward": 0.7589285731315613,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 205.71429443359375,
"epoch": 0.0275,
"grad_norm": 15.071895621435214,
"kl": 0.00064544677734375,
"learning_rate": 3.666666666666666e-07,
"loss": 0.0,
"num_tokens": 299820.0,
"reward": 1.5763698101043702,
"reward_std": 0.4757842034101486,
"rewards/classifier_reward": 0.431726861000061,
"rewards/length_reward": 0.25714286863803865,
"rewards/slop_reward": 0.8875,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 230.4285858154297,
"epoch": 0.03,
"grad_norm": 190.50276677565853,
"kl": 0.014757537841796875,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 325730.0,
"reward": 2.0366717338562013,
"reward_std": 0.44329026341438293,
"rewards/classifier_reward": 0.3331002712249756,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 0.8178571462631226,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 205.62857971191406,
"epoch": 0.0325,
"grad_norm": 12.86535673011762,
"kl": 0.0007328033447265625,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0,
"num_tokens": 350634.0,
"reward": 1.3644737243652343,
"reward_std": 0.5067215681076049,
"rewards/classifier_reward": 0.29661653861403464,
"rewards/length_reward": 0.31428571939468386,
"rewards/slop_reward": 0.7535714268684387,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 149.25715026855468,
"epoch": 0.035,
"grad_norm": 13.332730561008187,
"kl": 0.00087738037109375,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0,
"num_tokens": 373261.0,
"reward": 1.4552783489227294,
"reward_std": 0.6117016971111298,
"rewards/classifier_reward": 0.18384971991181373,
"rewards/length_reward": 0.40000001192092893,
"rewards/slop_reward": 0.8714285612106323,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 213.71429901123048,
"epoch": 0.0375,
"grad_norm": 12.15928533039066,
"kl": 0.001068878173828125,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 398319.0,
"reward": 1.7259351372718812,
"reward_std": 0.5805591940879822,
"rewards/classifier_reward": 0.5187922030687332,
"rewards/length_reward": 0.48571428954601287,
"rewards/slop_reward": 0.7214285731315613,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 200.94286499023437,
"epoch": 0.04,
"grad_norm": 210.10906643069993,
"kl": 0.006036376953125,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0,
"num_tokens": 423267.0,
"reward": 1.5157490253448487,
"reward_std": 0.4267912685871124,
"rewards/classifier_reward": 0.32646322101354597,
"rewards/length_reward": 0.37142857611179353,
"rewards/slop_reward": 0.8178571403026581,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 226.17143859863282,
"epoch": 0.0425,
"grad_norm": 10.900066010271587,
"kl": 0.0018096923828125,
"learning_rate": 5.666666666666666e-07,
"loss": 0.0,
"num_tokens": 449061.0,
"reward": 1.5792103052139281,
"reward_std": 0.5069970846176147,
"rewards/classifier_reward": 0.13992448002099991,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.7535714209079742,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 167.08572387695312,
"epoch": 0.045,
"grad_norm": 265.89705750792126,
"kl": 0.0086395263671875,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 472825.0,
"reward": 1.8174922943115235,
"reward_std": 0.39020195603370667,
"rewards/classifier_reward": 0.3174922451376915,
"rewards/length_reward": 0.6285714387893677,
"rewards/slop_reward": 0.8714285731315613,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 150.85714721679688,
"epoch": 0.0475,
"grad_norm": 16.75575953327707,
"kl": 0.0071929931640625,
"learning_rate": 6.333333333333332e-07,
"loss": 0.0,
"num_tokens": 496025.0,
"reward": 1.9033495664596558,
"reward_std": 0.46038708090782166,
"rewards/classifier_reward": 0.5676352053880691,
"rewards/length_reward": 0.4000000059604645,
"rewards/slop_reward": 0.9357142806053161,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 170.20000610351562,
"epoch": 0.05,
"grad_norm": 12.507998039364391,
"kl": 0.008447265625,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"num_tokens": 519775.0,
"reward": 1.8573363780975343,
"reward_std": 0.46103876233100893,
"rewards/classifier_reward": 0.3716219961643219,
"rewards/length_reward": 0.6571428596973419,
"rewards/slop_reward": 0.8285714149475097,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 254.57144317626953,
"epoch": 0.0525,
"grad_norm": 10.08679593336001,
"kl": 0.005413818359375,
"learning_rate": 7e-07,
"loss": 0.0,
"num_tokens": 546566.0,
"reward": 1.2423678815364838,
"reward_std": 0.4237713754177094,
"rewards/classifier_reward": 0.2472785457968712,
"rewards/length_reward": 0.45714286267757415,
"rewards/slop_reward": 0.5379464238882065,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 231.60000915527343,
"epoch": 0.055,
"grad_norm": 120.69825805503362,
"kl": 0.01463623046875,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0,
"num_tokens": 572155.0,
"reward": 1.6102877855300903,
"reward_std": 0.47996904850006106,
"rewards/classifier_reward": 0.39421619176864625,
"rewards/length_reward": 0.45714286267757415,
"rewards/slop_reward": 0.7589285731315613,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 244.971435546875,
"epoch": 0.0575,
"grad_norm": 10.506951478641199,
"kl": 0.0114501953125,
"learning_rate": 7.666666666666667e-07,
"loss": 0.0,
"num_tokens": 598457.0,
"reward": 1.7265684366226197,
"reward_std": 0.5010857343673706,
"rewards/classifier_reward": 0.35513979494571685,
"rewards/length_reward": 0.542857152223587,
"rewards/slop_reward": 0.8285714268684388,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 212.17143859863282,
"epoch": 0.06,
"grad_norm": 14.496994349875411,
"kl": 0.019873046875,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 623795.0,
"reward": 1.9756683349609374,
"reward_std": 0.6098839461803436,
"rewards/classifier_reward": 0.3970968216657639,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.8928571343421936,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 149.71429138183595,
"epoch": 0.0625,
"grad_norm": 13.955034723905843,
"kl": 0.023046875,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"num_tokens": 646708.0,
"reward": 1.7968461036682128,
"reward_std": 0.45076006054878237,
"rewards/classifier_reward": 0.3789888650178909,
"rewards/length_reward": 0.5142857193946838,
"rewards/slop_reward": 0.9035714268684387,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 172.02857818603516,
"epoch": 0.065,
"grad_norm": 13.086801218372846,
"kl": 0.03857421875,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0,
"num_tokens": 670238.0,
"reward": 2.018597435951233,
"reward_std": 0.5030077040195465,
"rewards/classifier_reward": 0.44716874957084657,
"rewards/length_reward": 0.6571428656578064,
"rewards/slop_reward": 0.9142857074737549,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 139.0571487426758,
"epoch": 0.0675,
"grad_norm": 13.874334958965811,
"kl": 0.044140625,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 693025.0,
"reward": 1.3813447833061219,
"reward_std": 0.4743997871875763,
"rewards/classifier_reward": 0.3313447292894125,
"rewards/length_reward": 0.28571428954601286,
"rewards/slop_reward": 0.7642857193946838,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 312.0857269287109,
"epoch": 0.07,
"grad_norm": 9.049982563385965,
"kl": 0.0622314453125,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0001,
"num_tokens": 721725.0,
"reward": 1.688427746295929,
"reward_std": 0.4802214980125427,
"rewards/classifier_reward": 0.48485626801848414,
"rewards/length_reward": 0.5142857193946838,
"rewards/slop_reward": 0.6892857074737548,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 170.34286499023438,
"epoch": 0.0725,
"grad_norm": 13.386821366059916,
"kl": 0.100341796875,
"learning_rate": 9.666666666666666e-07,
"loss": 0.0001,
"num_tokens": 745420.0,
"reward": 1.488853096961975,
"reward_std": 0.4731449127197266,
"rewards/classifier_reward": 0.2870673179626465,
"rewards/length_reward": 0.31428571939468386,
"rewards/slop_reward": 0.8875,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 250.0571502685547,
"epoch": 0.075,
"grad_norm": 11.769241786475325,
"kl": 0.08681640625,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 771967.0,
"reward": 1.6088370084762573,
"reward_std": 0.5624471366405487,
"rewards/classifier_reward": 0.2784797720611095,
"rewards/length_reward": 0.4857142925262451,
"rewards/slop_reward": 0.8446428537368774,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 203.6285858154297,
"epoch": 0.0775,
"grad_norm": 15.786130440761754,
"kl": 0.120703125,
"learning_rate": 9.999819767255174e-07,
"loss": 0.0001,
"num_tokens": 797014.0,
"reward": 1.9446904182434082,
"reward_std": 0.36648078858852384,
"rewards/classifier_reward": 0.4661188304424286,
"rewards/length_reward": 0.5428571462631225,
"rewards/slop_reward": 0.9357142806053161,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 201.771435546875,
"epoch": 0.08,
"grad_norm": 15.903852139666391,
"kl": 0.3494140625,
"learning_rate": 9.999279082014231e-07,
"loss": 0.0003,
"num_tokens": 821995.0,
"reward": 1.9787015676498414,
"reward_std": 0.585156524181366,
"rewards/classifier_reward": 0.5215587019920349,
"rewards/length_reward": 0.5428571552038193,
"rewards/slop_reward": 0.9142857074737549,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 224.42858276367187,
"epoch": 0.0825,
"grad_norm": 29.64188533235313,
"kl": 0.500390625,
"learning_rate": 9.998377983256848e-07,
"loss": 0.0005,
"num_tokens": 847628.0,
"reward": 2.037756896018982,
"reward_std": 0.5854993224143982,
"rewards/classifier_reward": 0.45918539762496946,
"rewards/length_reward": 0.6857142984867096,
"rewards/slop_reward": 0.8928571343421936,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 213.91429443359374,
"epoch": 0.085,
"grad_norm": 13.883875104448363,
"kl": 1.748828125,
"learning_rate": 9.997116535946027e-07,
"loss": 0.0018,
"num_tokens": 872871.0,
"reward": 1.7733258247375487,
"reward_std": 0.5001925647258758,
"rewards/classifier_reward": 0.4197543442249298,
"rewards/length_reward": 0.6857142955064773,
"rewards/slop_reward": 0.6678571343421936,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 229.771435546875,
"epoch": 0.0875,
"grad_norm": 14.166455798732835,
"kl": 1.48515625,
"learning_rate": 9.995494831023408e-07,
"loss": 0.0015,
"num_tokens": 898833.0,
"reward": 2.2283714771270753,
"reward_std": 0.4383322179317474,
"rewards/classifier_reward": 0.6283714175224304,
"rewards/length_reward": 0.6857142865657806,
"rewards/slop_reward": 0.9142857074737549,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 240.5428680419922,
"epoch": 0.09,
"grad_norm": 20.07276091542321,
"kl": 1.05703125,
"learning_rate": 9.993512985402724e-07,
"loss": 0.0011,
"num_tokens": 924948.0,
"reward": 1.8037936687469482,
"reward_std": 0.4002750262618065,
"rewards/classifier_reward": 0.4395078897476196,
"rewards/length_reward": 0.4285714328289032,
"rewards/slop_reward": 0.9357142806053161,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 249.42858123779297,
"epoch": 0.0925,
"grad_norm": 16.81239416666408,
"kl": 7.96875,
"learning_rate": 9.991171141961368e-07,
"loss": 0.0085,
"num_tokens": 951092.0,
"reward": 1.9397377490997314,
"reward_std": 0.4508820950984955,
"rewards/classifier_reward": 0.6325947523117066,
"rewards/length_reward": 0.4571428656578064,
"rewards/slop_reward": 0.8499999880790711,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 196.17143707275392,
"epoch": 0.095,
"grad_norm": 19.19953442548882,
"kl": 3.775,
"learning_rate": 9.988469469530085e-07,
"loss": 0.0038,
"num_tokens": 975652.0,
"reward": 2.0388022661209106,
"reward_std": 0.3248827219009399,
"rewards/classifier_reward": 0.32451642835512756,
"rewards/length_reward": 0.7142857193946839,
"rewards/slop_reward": 1.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 352.22858276367185,
"epoch": 0.0975,
"grad_norm": 18.25441612913441,
"kl": 0.79091796875,
"learning_rate": 9.985408162880813e-07,
"loss": 0.0008,
"num_tokens": 1005900.0,
"reward": 1.6981183767318726,
"reward_std": 0.433578160405159,
"rewards/classifier_reward": 0.5351718068122864,
"rewards/length_reward": 0.40000000298023225,
"rewards/slop_reward": 0.7629464268684387,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 321.02858276367186,
"epoch": 0.1,
"grad_norm": 10.54620387526757,
"kl": 0.754296875,
"learning_rate": 9.98198744271263e-07,
"loss": 0.0008,
"num_tokens": 1034877.0,
"reward": 1.9676434993743896,
"reward_std": 0.4024129122495651,
"rewards/classifier_reward": 0.6390719175338745,
"rewards/length_reward": 0.4571428656578064,
"rewards/slop_reward": 0.8714285612106323,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 214.17143859863282,
"epoch": 0.1025,
"grad_norm": 13.824754472854506,
"kl": 0.911328125,
"learning_rate": 9.978207555635855e-07,
"loss": 0.0009,
"num_tokens": 1060157.0,
"reward": 2.1357697248458862,
"reward_std": 0.5395443201065063,
"rewards/classifier_reward": 0.6357696294784546,
"rewards/length_reward": 0.5428571552038193,
"rewards/slop_reward": 0.9571428537368775,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 223.6285827636719,
"epoch": 0.105,
"grad_norm": 18.073667394788664,
"kl": 0.519921875,
"learning_rate": 9.97406877415425e-07,
"loss": 0.0005,
"num_tokens": 1085893.0,
"reward": 2.068192982673645,
"reward_std": 0.4686335951089859,
"rewards/classifier_reward": 0.4967642992734909,
"rewards/length_reward": 0.6571428775787354,
"rewards/slop_reward": 0.9142857074737549,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 297.40001220703124,
"epoch": 0.1075,
"grad_norm": 10.8725727731831,
"kl": 0.43515625,
"learning_rate": 9.9695713966454e-07,
"loss": 0.0004,
"num_tokens": 1114056.0,
"reward": 1.6727801322937013,
"reward_std": 0.501282411813736,
"rewards/classifier_reward": 0.501351535320282,
"rewards/length_reward": 0.3428571492433548,
"rewards/slop_reward": 0.8285714149475097,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 204.85715026855468,
"epoch": 0.11,
"grad_norm": 11.442387542173964,
"kl": 0.574609375,
"learning_rate": 9.964715747339175e-07,
"loss": 0.0006,
"num_tokens": 1138804.0,
"reward": 2.027732276916504,
"reward_std": 0.6377828001976014,
"rewards/classifier_reward": 0.6545179545879364,
"rewards/length_reward": 0.48571430146694183,
"rewards/slop_reward": 0.8875,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 290.6285888671875,
"epoch": 0.1125,
"grad_norm": 11.010885455058528,
"kl": 0.451171875,
"learning_rate": 9.959502176294382e-07,
"loss": 0.0005,
"num_tokens": 1166842.0,
"reward": 1.8717997074127197,
"reward_std": 0.45424606651067734,
"rewards/classifier_reward": 0.45037112236022947,
"rewards/length_reward": 0.4857142925262451,
"rewards/slop_reward": 0.9357142806053161,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 251.60001220703126,
"epoch": 0.115,
"grad_norm": 10.74794734294439,
"kl": 0.378125,
"learning_rate": 9.953931059373511e-07,
"loss": 0.0004,
"num_tokens": 1193568.0,
"reward": 2.1009025812149047,
"reward_std": 0.5576321303844451,
"rewards/classifier_reward": 0.6455453038215637,
"rewards/length_reward": 0.6000000059604644,
"rewards/slop_reward": 0.8553571462631225,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 205.51429595947266,
"epoch": 0.1175,
"grad_norm": 12.174860136090478,
"kl": 0.580859375,
"learning_rate": 9.948002798215631e-07,
"loss": 0.0006,
"num_tokens": 1218520.0,
"reward": 1.7478339910507201,
"reward_std": 0.44800390899181364,
"rewards/classifier_reward": 0.5246196419000626,
"rewards/length_reward": 0.4000000059604645,
"rewards/slop_reward": 0.8232142806053162,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 206.371435546875,
"epoch": 0.12,
"grad_norm": 10.734345225393307,
"kl": 0.77734375,
"learning_rate": 9.94171782020746e-07,
"loss": 0.0008,
"num_tokens": 1243553.0,
"reward": 2.33663330078125,
"reward_std": 0.5126991689205169,
"rewards/classifier_reward": 0.6152046620845795,
"rewards/length_reward": 0.7428571522235871,
"rewards/slop_reward": 0.9785714268684387,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 349.8000244140625,
"epoch": 0.1225,
"grad_norm": 9.516105487183257,
"kl": 0.6921875,
"learning_rate": 9.935076578452534e-07,
"loss": 0.0007,
"num_tokens": 1273677.0,
"reward": 1.6252358913421632,
"reward_std": 0.4947394013404846,
"rewards/classifier_reward": 0.5180929381400347,
"rewards/length_reward": 0.25714286267757414,
"rewards/slop_reward": 0.85,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 229.57143859863282,
"epoch": 0.125,
"grad_norm": 11.582425399894849,
"kl": 1.07734375,
"learning_rate": 9.928079551738541e-07,
"loss": 0.0011,
"num_tokens": 1299334.0,
"reward": 2.2410747528076174,
"reward_std": 0.5065959393978119,
"rewards/classifier_reward": 0.5834853827953339,
"rewards/length_reward": 0.6857143044471741,
"rewards/slop_reward": 0.971875,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 254.68572692871095,
"epoch": 0.1275,
"grad_norm": 9.691651370929392,
"kl": 0.7765625,
"learning_rate": 9.920727244502818e-07,
"loss": 0.0008,
"num_tokens": 1326112.0,
"reward": 1.8951802968978881,
"reward_std": 0.561316728591919,
"rewards/classifier_reward": 0.4666088119149208,
"rewards/length_reward": 0.5142857283353806,
"rewards/slop_reward": 0.9142857074737549,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 245.82858581542968,
"epoch": 0.13,
"grad_norm": 9.231142781375425,
"kl": 0.4640625,
"learning_rate": 9.913020186795966e-07,
"loss": 0.0005,
"num_tokens": 1352635.0,
"reward": 2.4359071254730225,
"reward_std": 0.504255086183548,
"rewards/classifier_reward": 0.8644783020019531,
"rewards/length_reward": 0.6571428656578064,
"rewards/slop_reward": 0.9142857074737549,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 336.2857360839844,
"epoch": 0.1325,
"grad_norm": 8.070502355443343,
"kl": 1.01142578125,
"learning_rate": 9.904958934243653e-07,
"loss": 0.001,
"num_tokens": 1382325.0,
"reward": 1.9690003395080566,
"reward_std": 0.4140917003154755,
"rewards/classifier_reward": 0.595339572429657,
"rewards/length_reward": 0.5142857253551483,
"rewards/slop_reward": 0.859375,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 176.85714721679688,
"epoch": 0.135,
"grad_norm": 13.273584784804553,
"kl": 0.737109375,
"learning_rate": 9.89654406800655e-07,
"loss": 0.0007,
"num_tokens": 1406364.0,
"reward": 2.015079474449158,
"reward_std": 0.34893424808979034,
"rewards/classifier_reward": 0.6650793373584747,
"rewards/length_reward": 0.37142857611179353,
"rewards/slop_reward": 0.9785714268684387,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 270.77144317626954,
"epoch": 0.1375,
"grad_norm": 10.259811273773323,
"kl": 0.41875,
"learning_rate": 9.887776194738431e-07,
"loss": 0.0004,
"num_tokens": 1433517.0,
"reward": 2.0523552179336546,
"reward_std": 0.43476098477840425,
"rewards/classifier_reward": 0.72378368973732,
"rewards/length_reward": 0.37142857909202576,
"rewards/slop_reward": 0.9571428537368775,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 270.20001220703125,
"epoch": 0.14,
"grad_norm": 98.33006992571028,
"kl": 25.924609375,
"learning_rate": 9.878655946542442e-07,
"loss": 0.0258,
"num_tokens": 1460894.0,
"reward": 2.2650604486465453,
"reward_std": 0.5314578056335449,
"rewards/classifier_reward": 0.7382745862007141,
"rewards/length_reward": 0.6285714358091354,
"rewards/slop_reward": 0.8982142806053162,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 213.82858581542968,
"epoch": 0.1425,
"grad_norm": 11.336151350522224,
"kl": 0.592578125,
"learning_rate": 9.86918398092553e-07,
"loss": 0.0006,
"num_tokens": 1486239.0,
"reward": 2.248945116996765,
"reward_std": 0.4141096830368042,
"rewards/classifier_reward": 0.6918022215366364,
"rewards/length_reward": 0.6000000059604644,
"rewards/slop_reward": 0.9571428537368775,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 301.85715637207034,
"epoch": 0.145,
"grad_norm": 9.956188328948622,
"kl": 0.44765625,
"learning_rate": 9.85936098075104e-07,
"loss": 0.0004,
"num_tokens": 1514661.0,
"reward": 1.9887679100036622,
"reward_std": 0.47130251824855807,
"rewards/classifier_reward": 0.6173391878604889,
"rewards/length_reward": 0.45714286267757415,
"rewards/slop_reward": 0.9142857074737549,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 194.42857971191407,
"epoch": 0.1475,
"grad_norm": 10.413710834913637,
"kl": 0.428125,
"learning_rate": 9.849187654189485e-07,
"loss": 0.0004,
"num_tokens": 1539249.0,
"reward": 2.1201124668121336,
"reward_std": 0.4662812829017639,
"rewards/classifier_reward": 0.8486838459968566,
"rewards/length_reward": 0.3142857253551483,
"rewards/slop_reward": 0.9571428537368775,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 223.71429443359375,
"epoch": 0.15,
"grad_norm": 9.48253169035506,
"kl": 0.6697265625,
"learning_rate": 9.838664734667495e-07,
"loss": 0.0007,
"num_tokens": 1564932.0,
"reward": 2.3097215414047243,
"reward_std": 0.37846060991287234,
"rewards/classifier_reward": 0.6097213685512543,
"rewards/length_reward": 0.7428571492433548,
"rewards/slop_reward": 0.9571428537368775,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 213.08572235107422,
"epoch": 0.1525,
"grad_norm": 17.056291850314448,
"kl": 0.484375,
"learning_rate": 9.827792980814933e-07,
"loss": 0.0005,
"num_tokens": 1590245.0,
"reward": 2.139114594459534,
"reward_std": 0.3371311604976654,
"rewards/classifier_reward": 0.6605431139469147,
"rewards/length_reward": 0.5428571492433548,
"rewards/slop_reward": 0.9357142806053161,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 224.68572540283202,
"epoch": 0.155,
"grad_norm": 9.724576982946711,
"kl": 0.580859375,
"learning_rate": 9.81657317641022e-07,
"loss": 0.0006,
"num_tokens": 1615704.0,
"reward": 2.022816562652588,
"reward_std": 0.4144854575395584,
"rewards/classifier_reward": 0.5585307866334915,
"rewards/length_reward": 0.4857142955064774,
"rewards/slop_reward": 0.9785714268684387,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 343.8285888671875,
"epoch": 0.1575,
"grad_norm": 7.143874630694377,
"kl": 0.4015625,
"learning_rate": 9.805006130323808e-07,
"loss": 0.0004,
"num_tokens": 1645654.0,
"reward": 2.0719400882720946,
"reward_std": 0.3876799166202545,
"rewards/classifier_reward": 0.6647971898317337,
"rewards/length_reward": 0.42857143878936765,
"rewards/slop_reward": 0.9785714268684387,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 266.00001220703126,
"epoch": 0.16,
"grad_norm": 9.518081765887688,
"kl": 0.3986328125,
"learning_rate": 9.793092676459888e-07,
"loss": 0.0004,
"num_tokens": 1672883.0,
"reward": 2.084528160095215,
"reward_std": 0.473650124669075,
"rewards/classifier_reward": 0.584528061747551,
"rewards/length_reward": 0.5428571552038193,
"rewards/slop_reward": 0.9571428537368775,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 279.71429748535155,
"epoch": 0.1625,
"grad_norm": 8.015308302511995,
"kl": 0.54375,
"learning_rate": 9.780833673696254e-07,
"loss": 0.0005,
"num_tokens": 1700438.0,
"reward": 2.3380573272705076,
"reward_std": 0.4121220216155052,
"rewards/classifier_reward": 0.8237714767456055,
"rewards/length_reward": 0.5142857223749161,
"rewards/slop_reward": 1.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 248.31430053710938,
"epoch": 0.165,
"grad_norm": 8.749621821849056,
"kl": 0.421484375,
"learning_rate": 9.768230005822393e-07,
"loss": 0.0004,
"num_tokens": 1727047.0,
"reward": 2.2453027963638306,
"reward_std": 0.38129588067531583,
"rewards/classifier_reward": 0.781016880273819,
"rewards/length_reward": 0.48571428954601287,
"rewards/slop_reward": 0.9785714268684387,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 283.3142974853516,
"epoch": 0.1675,
"grad_norm": 15.835663084851635,
"kl": 0.4734375,
"learning_rate": 9.755282581475767e-07,
"loss": 0.0005,
"num_tokens": 1754863.0,
"reward": 2.350207304954529,
"reward_std": 0.5121555209159852,
"rewards/classifier_reward": 0.7930643558502197,
"rewards/length_reward": 0.6000000149011612,
"rewards/slop_reward": 0.9571428537368775,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 207.0571502685547,
"epoch": 0.17,
"grad_norm": 10.560189239731793,
"kl": 0.418359375,
"learning_rate": 9.741992334076308e-07,
"loss": 0.0004,
"num_tokens": 1780017.0,
"reward": 2.56486029624939,
"reward_std": 0.4183764517307281,
"rewards/classifier_reward": 0.7505744874477387,
"rewards/length_reward": 0.8571428656578064,
"rewards/slop_reward": 0.9571428537368775,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 207.31429443359374,
"epoch": 0.1725,
"grad_norm": 12.483812322806989,
"kl": 0.690234375,
"learning_rate": 9.728360221759123e-07,
"loss": 0.0007,
"num_tokens": 1805172.0,
"reward": 2.421455478668213,
"reward_std": 0.5775963604450226,
"rewards/classifier_reward": 0.7643125176429748,
"rewards/length_reward": 0.7428571522235871,
"rewards/slop_reward": 0.9142857074737549,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 262.8285858154297,
"epoch": 0.175,
"grad_norm": 10.184314056992156,
"kl": 0.665234375,
"learning_rate": 9.71438722730542e-07,
"loss": 0.0007,
"num_tokens": 1832291.0,
"reward": 2.078964352607727,
"reward_std": 0.5653822362422943,
"rewards/classifier_reward": 0.6146785497665406,
"rewards/length_reward": 0.4857142984867096,
"rewards/slop_reward": 0.9785714268684387,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 338.9428741455078,
"epoch": 0.1775,
"grad_norm": 8.806960986372623,
"kl": 0.46015625,
"learning_rate": 9.700074358071656e-07,
"loss": 0.0005,
"num_tokens": 1861997.0,
"reward": 2.0550333499908446,
"reward_std": 0.5070074677467347,
"rewards/classifier_reward": 0.7263502657413483,
"rewards/length_reward": 0.40000001192092893,
"rewards/slop_reward": 0.9286830306053162,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 270.0857299804687,
"epoch": 0.18,
"grad_norm": 9.059350132300025,
"kl": 0.450390625,
"learning_rate": 9.685422645916918e-07,
"loss": 0.0005,
"num_tokens": 1888854.0,
"reward": 2.531459331512451,
"reward_std": 0.3892214775085449,
"rewards/classifier_reward": 0.860030734539032,
"rewards/length_reward": 0.7142857193946839,
"rewards/slop_reward": 0.9571428537368775,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 291.4285827636719,
"epoch": 0.1825,
"grad_norm": 12.813964228104469,
"kl": 0.52265625,
"learning_rate": 9.670433147128521e-07,
"loss": 0.0005,
"num_tokens": 1916974.0,
"reward": 2.2648436784744264,
"reward_std": 0.40854659080505373,
"rewards/classifier_reward": 0.8577007412910461,
"rewards/length_reward": 0.42857143878936765,
"rewards/slop_reward": 0.9785714268684387,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 246.31429748535157,
"epoch": 0.185,
"grad_norm": 241.73474838722862,
"kl": 0.453125,
"learning_rate": 9.655106942345868e-07,
"loss": 0.0005,
"num_tokens": 1943440.0,
"reward": 2.366797590255737,
"reward_std": 0.41730722188949587,
"rewards/classifier_reward": 0.766797399520874,
"rewards/length_reward": 0.600000011920929,
"rewards/slop_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 250.91429443359374,
"epoch": 0.1875,
"grad_norm": 8.395335843546244,
"kl": 0.5015625,
"learning_rate": 9.639445136482546e-07,
"loss": 0.0005,
"num_tokens": 1969900.0,
"reward": 2.4618414878845214,
"reward_std": 0.540682977437973,
"rewards/classifier_reward": 0.8261271595954895,
"rewards/length_reward": 0.6571428716182709,
"rewards/slop_reward": 0.9785714268684387,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 247.11429748535156,
"epoch": 0.19,
"grad_norm": 9.31969072416839,
"kl": 0.53671875,
"learning_rate": 9.623448858646656e-07,
"loss": 0.0005,
"num_tokens": 1996327.0,
"reward": 2.259377145767212,
"reward_std": 0.4671668648719788,
"rewards/classifier_reward": 0.916519820690155,
"rewards/length_reward": 0.3428571581840515,
"rewards/slop_reward": 1.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 316.9714416503906,
"epoch": 0.1925,
"grad_norm": 7.365001527017238,
"kl": 1.408984375,
"learning_rate": 9.607119262059425e-07,
"loss": 0.0014,
"num_tokens": 2024968.0,
"reward": 2.266000509262085,
"reward_std": 0.5217303335666656,
"rewards/classifier_reward": 0.8231433391571045,
"rewards/length_reward": 0.48571430146694183,
"rewards/slop_reward": 0.9571428537368775,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 228.62857971191406,
"epoch": 0.195,
"grad_norm": 45.20061919834423,
"kl": 23.44453125,
"learning_rate": 9.590457523972055e-07,
"loss": 0.0236,
"num_tokens": 2050735.0,
"reward": 2.3131242275238035,
"reward_std": 0.5057340741157532,
"rewards/classifier_reward": 0.8827670216560364,
"rewards/length_reward": 0.4571428656578064,
"rewards/slop_reward": 0.9732142806053161,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 217.91429443359374,
"epoch": 0.1975,
"grad_norm": 9.435990839066255,
"kl": 0.4484375,
"learning_rate": 9.573464845580863e-07,
"loss": 0.0004,
"num_tokens": 2076160.0,
"reward": 2.3861000537872314,
"reward_std": 0.6137513637542724,
"rewards/classifier_reward": 0.8432427525520325,
"rewards/length_reward": 0.5428571581840516,
"rewards/slop_reward": 1.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 250.91429443359374,
"epoch": 0.2,
"grad_norm": 7.98212579749696,
"kl": 0.5078125,
"learning_rate": 9.556142451940679e-07,
"loss": 0.0005,
"num_tokens": 2102862.0,
"reward": 2.2402546644210815,
"reward_std": 0.602302199602127,
"rewards/classifier_reward": 0.7473974108695984,
"rewards/length_reward": 0.600000011920929,
"rewards/slop_reward": 0.8928571343421936,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 287.34287414550784,
"epoch": 0.2025,
"grad_norm": 8.04736588295688,
"kl": 0.61015625,
"learning_rate": 9.53849159187652e-07,
"loss": 0.0006,
"num_tokens": 2130556.0,
"reward": 2.3916960954666138,
"reward_std": 0.45145381689071656,
"rewards/classifier_reward": 0.9631245970726013,
"rewards/length_reward": 0.5142857253551483,
"rewards/slop_reward": 0.9142857074737549,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 264.02857666015626,
"epoch": 0.205,
"grad_norm": 8.203747363662693,
"kl": 0.523828125,
"learning_rate": 9.520513537893573e-07,
"loss": 0.0005,
"num_tokens": 2157656.0,
"reward": 2.4363236665725707,
"reward_std": 0.32723597437143326,
"rewards/classifier_reward": 0.7720378637313843,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.9785714268684387,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 229.02858276367186,
"epoch": 0.2075,
"grad_norm": 7.89698513820585,
"kl": 0.556640625,
"learning_rate": 9.502209586085442e-07,
"loss": 0.0006,
"num_tokens": 2183592.0,
"reward": 2.6267528533935547,
"reward_std": 0.41746888160705564,
"rewards/classifier_reward": 0.9338955879211426,
"rewards/length_reward": 0.7142857253551483,
"rewards/slop_reward": 0.9785714268684387,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 224.37144165039064,
"epoch": 0.21,
"grad_norm": 8.428058041901203,
"kl": 0.534765625,
"learning_rate": 9.483581056040718e-07,
"loss": 0.0005,
"num_tokens": 2209201.0,
"reward": 2.4542306900024413,
"reward_std": 0.4101273000240326,
"rewards/classifier_reward": 0.9113734126091003,
"rewards/length_reward": 0.5428571552038193,
"rewards/slop_reward": 1.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 257.71429443359375,
"epoch": 0.2125,
"grad_norm": 9.035933890702934,
"kl": 0.5140625,
"learning_rate": 9.464629290747842e-07,
"loss": 0.0005,
"num_tokens": 2236141.0,
"reward": 2.5825096130371095,
"reward_std": 0.45454559922218324,
"rewards/classifier_reward": 0.9753666520118713,
"rewards/length_reward": 0.6285714417695999,
"rewards/slop_reward": 0.9785714268684387,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 246.74286804199218,
"epoch": 0.215,
"grad_norm": 9.318935519161128,
"kl": 0.555078125,
"learning_rate": 9.445355656498284e-07,
"loss": 0.0006,
"num_tokens": 2262697.0,
"reward": 2.383002519607544,
"reward_std": 0.5029416978359222,
"rewards/classifier_reward": 0.8830024480819703,
"rewards/length_reward": 0.5428571611642837,
"rewards/slop_reward": 0.9571428537368775,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 324.9428741455078,
"epoch": 0.2175,
"grad_norm": 10.476800992230684,
"kl": 0.515234375,
"learning_rate": 9.425761542788048e-07,
"loss": 0.001,
"num_tokens": 2291611.0,
"reward": 2.4253102779388427,
"reward_std": 0.25630177855491637,
"rewards/classifier_reward": 0.9878101944923401,
"rewards/length_reward": 0.4857142955064774,
"rewards/slop_reward": 0.9517857193946838,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 285.88572387695314,
"epoch": 0.22,
"grad_norm": 8.262503873333891,
"kl": 0.623828125,
"learning_rate": 9.40584836221749e-07,
"loss": 0.0006,
"num_tokens": 2319400.0,
"reward": 2.459568977355957,
"reward_std": 0.5227903485298157,
"rewards/classifier_reward": 0.9595688700675964,
"rewards/length_reward": 0.5428571581840516,
"rewards/slop_reward": 0.9571428537368775,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 361.48572998046876,
"epoch": 0.2225,
"grad_norm": 8.639609161679585,
"kl": 0.61015625,
"learning_rate": 9.385617550389489e-07,
"loss": 0.0006,
"num_tokens": 2349972.0,
"reward": 2.21625759601593,
"reward_std": 0.43572868704795836,
"rewards/classifier_reward": 0.8430432677268982,
"rewards/length_reward": 0.4000000089406967,
"rewards/slop_reward": 0.9732142806053161,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 230.5428680419922,
"epoch": 0.225,
"grad_norm": 9.545010126368608,
"kl": 0.606640625,
"learning_rate": 9.36507056580594e-07,
"loss": 0.0006,
"num_tokens": 2375941.0,
"reward": 2.5845241069793703,
"reward_std": 0.4674242250621319,
"rewards/classifier_reward": 0.9559526205062866,
"rewards/length_reward": 0.6285714387893677,
"rewards/slop_reward": 1.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 255.971435546875,
"epoch": 0.2275,
"grad_norm": 8.313767260844978,
"kl": 0.705859375,
"learning_rate": 9.34420888976262e-07,
"loss": 0.0007,
"num_tokens": 2402820.0,
"reward": 2.281861972808838,
"reward_std": 0.5015565395355225,
"rewards/classifier_reward": 0.9032904744148255,
"rewards/length_reward": 0.48571430444717406,
"rewards/slop_reward": 0.8928571343421936,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 285.71430053710935,
"epoch": 0.23,
"grad_norm": 15.296685695089757,
"kl": 12.5765625,
"learning_rate": 9.323034026242377e-07,
"loss": 0.0126,
"num_tokens": 2430740.0,
"reward": 2.457769823074341,
"reward_std": 0.4592562437057495,
"rewards/classifier_reward": 0.9363411664962769,
"rewards/length_reward": 0.542857152223587,
"rewards/slop_reward": 0.9785714268684387,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 283.9143035888672,
"epoch": 0.2325,
"grad_norm": 41.32916438788783,
"kl": 0.500390625,
"learning_rate": 9.301547501806724e-07,
"loss": 0.0005,
"num_tokens": 2458445.0,
"reward": 2.432806062698364,
"reward_std": 0.4543603718280792,
"rewards/classifier_reward": 0.8256630301475525,
"rewards/length_reward": 0.6285714387893677,
"rewards/slop_reward": 0.9785714268684387,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 231.2571533203125,
"epoch": 0.235,
"grad_norm": 8.514937250494981,
"kl": 0.56875,
"learning_rate": 9.279750865485772e-07,
"loss": 0.001,
"num_tokens": 2484459.0,
"reward": 2.6583719730377195,
"reward_std": 0.41457981467247007,
"rewards/classifier_reward": 0.9726575374603271,
"rewards/length_reward": 0.6857142984867096,
"rewards/slop_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 239.4571563720703,
"epoch": 0.2375,
"grad_norm": 8.130278831610969,
"kl": 0.586328125,
"learning_rate": 9.257645688666555e-07,
"loss": 0.0006,
"num_tokens": 2510647.0,
"reward": 2.552411127090454,
"reward_std": 0.444735050201416,
"rewards/classifier_reward": 0.9220538377761841,
"rewards/length_reward": 0.6571428656578064,
"rewards/slop_reward": 0.9732142806053161,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 248.48572692871093,
"epoch": 0.24,
"grad_norm": 11.012318216722969,
"kl": 0.559375,
"learning_rate": 9.235233564979754e-07,
"loss": 0.0006,
"num_tokens": 2537166.0,
"reward": 2.3741667747497557,
"reward_std": 0.5231991052627564,
"rewards/classifier_reward": 0.7955952703952789,
"rewards/length_reward": 0.600000011920929,
"rewards/slop_reward": 0.9785714268684387,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 242.91429748535157,
"epoch": 0.2425,
"grad_norm": 9.607401037517901,
"kl": 0.503125,
"learning_rate": 9.212516110184794e-07,
"loss": 0.0005,
"num_tokens": 2563588.0,
"reward": 2.6350881576538088,
"reward_std": 0.47173853516578673,
"rewards/classifier_reward": 0.8618737578392028,
"rewards/length_reward": 0.8000000059604645,
"rewards/slop_reward": 0.9732142806053161,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 193.971435546875,
"epoch": 0.245,
"grad_norm": 10.623433793787921,
"kl": 0.7234375,
"learning_rate": 9.189494962053368e-07,
"loss": 0.0007,
"num_tokens": 2588297.0,
"reward": 2.389581322669983,
"reward_std": 0.4812875479459763,
"rewards/classifier_reward": 0.903866958618164,
"rewards/length_reward": 0.4857142984867096,
"rewards/slop_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 171.51429443359376,
"epoch": 0.2475,
"grad_norm": 11.438435269316447,
"kl": 0.64375,
"learning_rate": 9.166171780251364e-07,
"loss": 0.0006,
"num_tokens": 2612015.0,
"reward": 2.513836717605591,
"reward_std": 0.26052397638559344,
"rewards/classifier_reward": 0.8852650642395019,
"rewards/length_reward": 0.6285714328289032,
"rewards/slop_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 167.14286346435546,
"epoch": 0.25,
"grad_norm": 11.041028322429328,
"kl": 0.7359375,
"learning_rate": 9.14254824621921e-07,
"loss": 0.0007,
"num_tokens": 2635785.0,
"reward": 2.355943202972412,
"reward_std": 0.33155601024627684,
"rewards/classifier_reward": 0.8416573882102967,
"rewards/length_reward": 0.5142857193946838,
"rewards/slop_reward": 1.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 163.74286499023438,
"epoch": 0.2525,
"grad_norm": 9.00301084087431,
"kl": 0.75390625,
"learning_rate": 9.118626063050661e-07,
"loss": 0.0012,
"num_tokens": 2659436.0,
"reward": 2.5006046295166016,
"reward_std": 0.4252849280834198,
"rewards/classifier_reward": 0.9506044745445251,
"rewards/length_reward": 0.571428582072258,
"rewards/slop_reward": 0.9785714268684387,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 195.42858276367187,
"epoch": 0.255,
"grad_norm": 9.702866580028527,
"kl": 0.79921875,
"learning_rate": 9.094406955370008e-07,
"loss": 0.0008,
"num_tokens": 2683861.0,
"reward": 2.5624767780303954,
"reward_std": 0.47246721386909485,
"rewards/classifier_reward": 0.9267622709274292,
"rewards/length_reward": 0.6571428716182709,
"rewards/slop_reward": 0.9785714268684387,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 183.8857223510742,
"epoch": 0.2575,
"grad_norm": 7.999927126827247,
"kl": 0.7359375,
"learning_rate": 9.069892669207757e-07,
"loss": 0.0007,
"num_tokens": 2708217.0,
"reward": 2.4374377012252806,
"reward_std": 0.3513069462031126,
"rewards/classifier_reward": 0.8374375879764557,
"rewards/length_reward": 0.6000000059604644,
"rewards/slop_reward": 1.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 203.74286804199218,
"epoch": 0.26,
"grad_norm": 8.535286841601573,
"kl": 0.6046875,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0006,
"num_tokens": 2733039.0,
"reward": 2.590514373779297,
"reward_std": 0.3418044149875641,
"rewards/classifier_reward": 0.7905142605304718,
"rewards/length_reward": 0.8000000059604645,
"rewards/slop_reward": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 186.88572387695314,
"epoch": 0.2625,
"grad_norm": 11.150131541909593,
"kl": 2.3078125,
"learning_rate": 9.019985651834703e-07,
"loss": 0.0023,
"num_tokens": 2757500.0,
"reward": 2.6002991676330565,
"reward_std": 0.47049993872642515,
"rewards/classifier_reward": 0.850299060344696,
"rewards/length_reward": 0.7714285790920258,
"rewards/slop_reward": 0.9785714268684387,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 231.48572692871093,
"epoch": 0.265,
"grad_norm": 7.346740346808247,
"kl": 0.694140625,
"learning_rate": 8.994596518575391e-07,
"loss": 0.0007,
"num_tokens": 2783522.0,
"reward": 2.6393914222717285,
"reward_std": 0.4599771976470947,
"rewards/classifier_reward": 0.8965341806411743,
"rewards/length_reward": 0.7428571581840515,
"rewards/slop_reward": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 293.51429748535156,
"epoch": 0.2675,
"grad_norm": 7.782363877670758,
"kl": 0.71328125,
"learning_rate": 8.968919402478075e-07,
"loss": 0.0007,
"num_tokens": 2811715.0,
"reward": 2.497778224945068,
"reward_std": 0.49988613873720167,
"rewards/classifier_reward": 0.8692066669464111,
"rewards/length_reward": 0.6285714387893677,
"rewards/slop_reward": 1.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 202.40000915527344,
"epoch": 0.27,
"grad_norm": 8.407812702495294,
"kl": 0.9671875,
"learning_rate": 8.942956154685595e-07,
"loss": 0.001,
"num_tokens": 2836377.0,
"reward": 2.7594990730285645,
"reward_std": 0.35219337940216067,
"rewards/classifier_reward": 0.9237847208976746,
"rewards/length_reward": 0.8571428656578064,
"rewards/slop_reward": 0.9785714268684387,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 311.14288024902345,
"epoch": 0.2725,
"grad_norm": 10.954001826900827,
"kl": 0.803515625,
"learning_rate": 8.916708646968923e-07,
"loss": 0.0008,
"num_tokens": 2865187.0,
"reward": 2.3341631174087523,
"reward_std": 0.3795748669654131,
"rewards/classifier_reward": 0.6913058979436755,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.9571428537368775,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 295.88572387695314,
"epoch": 0.275,
"grad_norm": 7.24224369671047,
"kl": 0.6296875,
"learning_rate": 8.890178771592197e-07,
"loss": 0.0006,
"num_tokens": 2893081.0,
"reward": 2.574794292449951,
"reward_std": 0.4517861694097519,
"rewards/classifier_reward": 0.9319370150566101,
"rewards/length_reward": 0.6857142984867096,
"rewards/slop_reward": 0.9571428537368775,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 310.4285858154297,
"epoch": 0.2775,
"grad_norm": 6.977209722960256,
"kl": 1.34609375,
"learning_rate": 8.863368441176325e-07,
"loss": 0.0013,
"num_tokens": 2921771.0,
"reward": 2.2888038635253904,
"reward_std": 0.6448704779148102,
"rewards/classifier_reward": 0.8455002188682557,
"rewards/length_reward": 0.5142857253551483,
"rewards/slop_reward": 0.9290178537368774,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 251.82858276367188,
"epoch": 0.28,
"grad_norm": 6.8381698154250525,
"kl": 0.86640625,
"learning_rate": 8.836279588561081e-07,
"loss": 0.0009,
"num_tokens": 2948383.0,
"reward": 2.4712666511535644,
"reward_std": 0.4860814154148102,
"rewards/classifier_reward": 0.8141236484050751,
"rewards/length_reward": 0.6571428716182709,
"rewards/slop_reward": 1.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 275.74287109375,
"epoch": 0.2825,
"grad_norm": 5.967234970457837,
"kl": 0.8515625,
"learning_rate": 8.808914166665772e-07,
"loss": 0.0013,
"num_tokens": 2975877.0,
"reward": 2.6856667041778564,
"reward_std": 0.38376912772655486,
"rewards/classifier_reward": 0.9856665849685669,
"rewards/length_reward": 0.7428571522235871,
"rewards/slop_reward": 0.9571428537368775,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 291.1428680419922,
"epoch": 0.285,
"grad_norm": 6.696088875243213,
"kl": 0.61484375,
"learning_rate": 8.781274148348436e-07,
"loss": 0.0006,
"num_tokens": 3003901.0,
"reward": 2.584765911102295,
"reward_std": 0.441849821805954,
"rewards/classifier_reward": 0.8704800248146057,
"rewards/length_reward": 0.7142857313156128,
"rewards/slop_reward": 1.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 286.6000152587891,
"epoch": 0.2875,
"grad_norm": 5.913822453334042,
"kl": 0.678125,
"learning_rate": 8.753361526263621e-07,
"loss": 0.0007,
"num_tokens": 3031852.0,
"reward": 2.6028482913970947,
"reward_std": 0.2958831213414669,
"rewards/classifier_reward": 0.9957053542137146,
"rewards/length_reward": 0.6285714387893677,
"rewards/slop_reward": 0.9785714268684387,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 292.20001525878905,
"epoch": 0.29,
"grad_norm": 5.738716860165764,
"kl": 0.72578125,
"learning_rate": 8.725178312718725e-07,
"loss": 0.0012,
"num_tokens": 3059999.0,
"reward": 2.596400237083435,
"reward_std": 0.3507813632488251,
"rewards/classifier_reward": 0.953542971611023,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.9571428537368775,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 239.1428680419922,
"epoch": 0.2925,
"grad_norm": 69.40308583594562,
"kl": 1.6,
"learning_rate": 8.696726539528923e-07,
"loss": 0.0021,
"num_tokens": 3086289.0,
"reward": 2.7766035079956053,
"reward_std": 0.3328893929719925,
"rewards/classifier_reward": 0.8908890843391418,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 259.22858276367185,
"epoch": 0.295,
"grad_norm": 5.40703806614853,
"kl": 0.734765625,
"learning_rate": 8.668008257870682e-07,
"loss": 0.0012,
"num_tokens": 3113282.0,
"reward": 2.7762694358825684,
"reward_std": 0.2866129666566849,
"rewards/classifier_reward": 0.9476978421211243,
"rewards/length_reward": 0.8285714328289032,
"rewards/slop_reward": 1.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 201.91429443359374,
"epoch": 0.2975,
"grad_norm": 7.252699872821293,
"kl": 0.81015625,
"learning_rate": 8.639025538133897e-07,
"loss": 0.0013,
"num_tokens": 3138256.0,
"reward": 2.851440095901489,
"reward_std": 0.20830639004707335,
"rewards/classifier_reward": 0.9085827589035034,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 250.20001525878905,
"epoch": 0.3,
"grad_norm": 6.062851209216701,
"kl": 0.92578125,
"learning_rate": 8.609780469772621e-07,
"loss": 0.0014,
"num_tokens": 3164933.0,
"reward": 2.786311960220337,
"reward_std": 0.29145972728729247,
"rewards/classifier_reward": 0.9005975008010865,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 199.4857208251953,
"epoch": 0.3025,
"grad_norm": 8.363986536112053,
"kl": 0.746875,
"learning_rate": 8.580275161154431e-07,
"loss": 0.0007,
"num_tokens": 3189764.0,
"reward": 2.89967794418335,
"reward_std": 0.15721405297517776,
"rewards/classifier_reward": 0.9496778607368469,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 218.88572387695314,
"epoch": 0.305,
"grad_norm": 104.91081313736798,
"kl": 17.3171875,
"learning_rate": 8.550511739408428e-07,
"loss": 0.0182,
"num_tokens": 3215345.0,
"reward": 2.980521392822266,
"reward_std": 0.05153606534004211,
"rewards/classifier_reward": 0.9805211901664734,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 222.91430053710937,
"epoch": 0.3075,
"grad_norm": 7.509956960673518,
"kl": 0.934375,
"learning_rate": 8.520492350271895e-07,
"loss": 0.001,
"num_tokens": 3241067.0,
"reward": 2.8307112216949464,
"reward_std": 0.1734127746662125,
"rewards/classifier_reward": 0.8521397054195404,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 240.17144165039062,
"epoch": 0.31,
"grad_norm": 6.959659369798759,
"kl": 1.028125,
"learning_rate": 8.490219157935588e-07,
"loss": 0.0015,
"num_tokens": 3267393.0,
"reward": 2.7303539276123048,
"reward_std": 0.1922714289277792,
"rewards/classifier_reward": 0.9517823934555054,
"rewards/length_reward": 0.8000000029802322,
"rewards/slop_reward": 0.9785714268684387,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 229.5428680419922,
"epoch": 0.3125,
"grad_norm": 8.221210886450612,
"kl": 1.91640625,
"learning_rate": 8.459694344887731e-07,
"loss": 0.0019,
"num_tokens": 3293186.0,
"reward": 2.739912986755371,
"reward_std": 0.36319895684719083,
"rewards/classifier_reward": 0.9541985750198364,
"rewards/length_reward": 0.8285714328289032,
"rewards/slop_reward": 0.9571428537368775,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 216.02858276367186,
"epoch": 0.315,
"grad_norm": 6.737006006501442,
"kl": 1.015625,
"learning_rate": 8.428920111756657e-07,
"loss": 0.0015,
"num_tokens": 3318667.0,
"reward": 2.7134992361068724,
"reward_std": 0.2444542996585369,
"rewards/classifier_reward": 0.8563561499118805,
"rewards/length_reward": 0.8571428596973419,
"rewards/slop_reward": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 226.20001220703125,
"epoch": 0.3175,
"grad_norm": 4.36908357763648,
"kl": 0.96640625,
"learning_rate": 8.397898677152172e-07,
"loss": 0.0024,
"num_tokens": 3344503.0,
"reward": 2.9305933475494386,
"reward_std": 0.12042829990386963,
"rewards/classifier_reward": 0.9591646075248719,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 203.31429443359374,
"epoch": 0.32,
"grad_norm": 6.1156992776971855,
"kl": 0.85703125,
"learning_rate": 8.366632277505597e-07,
"loss": 0.0018,
"num_tokens": 3369294.0,
"reward": 2.9620502471923826,
"reward_std": 0.07551092505455018,
"rewards/classifier_reward": 0.983478581905365,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 220.0285858154297,
"epoch": 0.3225,
"grad_norm": 6.8222141729156975,
"kl": 0.85859375,
"learning_rate": 8.335123166908543e-07,
"loss": 0.0013,
"num_tokens": 3394915.0,
"reward": 2.7929779529571532,
"reward_std": 0.28287690281867983,
"rewards/classifier_reward": 0.9340491890907288,
"rewards/length_reward": 0.8857142865657807,
"rewards/slop_reward": 0.9732142806053161,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 225.62857971191406,
"epoch": 0.325,
"grad_norm": 4.536469246397557,
"kl": 1.11015625,
"learning_rate": 8.303373616950406e-07,
"loss": 0.0025,
"num_tokens": 3420549.0,
"reward": 2.9556642055511473,
"reward_std": 0.07710518054664135,
"rewards/classifier_reward": 0.9985211491584778,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9571428537368775,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 220.68572387695312,
"epoch": 0.3275,
"grad_norm": 2.9694599664159407,
"kl": 0.9546875,
"learning_rate": 8.271385916554604e-07,
"loss": 0.0029,
"num_tokens": 3445788.0,
"reward": 2.996094989776611,
"reward_std": 0.010332237184047698,
"rewards/classifier_reward": 0.9960947871208191,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 296.20001831054685,
"epoch": 0.33,
"grad_norm": 4.064839178436649,
"kl": 1.1390625,
"learning_rate": 8.23916237181355e-07,
"loss": 0.0016,
"num_tokens": 3474044.0,
"reward": 2.4268852710723876,
"reward_std": 0.15440489053726197,
"rewards/classifier_reward": 0.9697423577308655,
"rewards/length_reward": 0.4571428596973419,
"rewards/slop_reward": 1.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 282.771435546875,
"epoch": 0.3325,
"grad_norm": 10.750678349276473,
"kl": 0.82734375,
"learning_rate": 8.206705305822412e-07,
"loss": 0.0013,
"num_tokens": 3501861.0,
"reward": 2.4831544876098635,
"reward_std": 0.26082203090190886,
"rewards/classifier_reward": 0.9831543445587159,
"rewards/length_reward": 0.5428571492433548,
"rewards/slop_reward": 0.9571428537368775,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 247.34286804199218,
"epoch": 0.335,
"grad_norm": 106.5245827902456,
"kl": 88.11328125,
"learning_rate": 8.174017058511628e-07,
"loss": 0.0893,
"num_tokens": 3528356.0,
"reward": 2.8546416759490967,
"reward_std": 0.2578580856323242,
"rewards/classifier_reward": 0.9903557300567627,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 0.9785714268684387,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 280.2857299804688,
"epoch": 0.3375,
"grad_norm": 5.075379696681929,
"kl": 1.60859375,
"learning_rate": 8.141099986478212e-07,
"loss": 0.0021,
"num_tokens": 3555922.0,
"reward": 2.635714387893677,
"reward_std": 0.31418272852897644,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.6571428656578064,
"rewards/slop_reward": 0.9785714268684387,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 304.5714416503906,
"epoch": 0.34,
"grad_norm": 5.929984152853479,
"kl": 1.1859375,
"learning_rate": 8.107956462815861e-07,
"loss": 0.0017,
"num_tokens": 3584471.0,
"reward": 2.3495986461639404,
"reward_std": 0.3951677083969116,
"rewards/classifier_reward": 0.8853127479553222,
"rewards/length_reward": 0.4857142955064774,
"rewards/slop_reward": 0.9785714268684387,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 288.2857299804688,
"epoch": 0.3425,
"grad_norm": 8.1455480169376,
"kl": 1.215625,
"learning_rate": 8.074588876943872e-07,
"loss": 0.0012,
"num_tokens": 3612481.0,
"reward": 2.4660715579986574,
"reward_std": 0.5484442114830017,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.5142857283353806,
"rewards/slop_reward": 0.9517857074737549,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 267.8000061035156,
"epoch": 0.345,
"grad_norm": 5.930202214193798,
"kl": 0.98125,
"learning_rate": 8.040999634434882e-07,
"loss": 0.0015,
"num_tokens": 3639774.0,
"reward": 2.7785715579986574,
"reward_std": 0.39145426750183104,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.800000011920929,
"rewards/slop_reward": 0.9785714268684387,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 255.57144165039062,
"epoch": 0.3475,
"grad_norm": 4.399939053025549,
"kl": 1.53125,
"learning_rate": 8.00719115684144e-07,
"loss": 0.0025,
"num_tokens": 3666639.0,
"reward": 2.6642858505249025,
"reward_std": 0.21827136874198913,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.6857142925262452,
"rewards/slop_reward": 0.9785714268684387,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 251.571435546875,
"epoch": 0.35,
"grad_norm": 6.704733780428727,
"kl": 1.0859375,
"learning_rate": 7.973165881521433e-07,
"loss": 0.002,
"num_tokens": 3693159.0,
"reward": 2.8857144355773925,
"reward_std": 0.24877579212188722,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 248.9428680419922,
"epoch": 0.3525,
"grad_norm": 3.4261047442168397,
"kl": 0.93359375,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0028,
"num_tokens": 3719792.0,
"reward": 2.742554450035095,
"reward_std": 0.16382334232330323,
"rewards/classifier_reward": 0.8568399548530579,
"rewards/length_reward": 0.8857142865657807,
"rewards/slop_reward": 1.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 212.3714385986328,
"epoch": 0.355,
"grad_norm": 6.044238574227886,
"kl": 1.3890625,
"learning_rate": 7.90447476510452e-07,
"loss": 0.0028,
"num_tokens": 3745103.0,
"reward": 2.9194665908813477,
"reward_std": 0.21307192444801332,
"rewards/classifier_reward": 0.9980378150939941,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 0.9785714268684387,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 184.5428680419922,
"epoch": 0.3575,
"grad_norm": 4.2944694392303155,
"kl": 1.140625,
"learning_rate": 7.869813876162998e-07,
"loss": 0.003,
"num_tokens": 3769090.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 207.1428680419922,
"epoch": 0.36,
"grad_norm": 2.4809967074334627,
"kl": 1.1765625,
"learning_rate": 7.834946093448658e-07,
"loss": 0.0031,
"num_tokens": 3794079.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 197.17143859863282,
"epoch": 0.3625,
"grad_norm": 3.755882982422092,
"kl": 1.396875,
"learning_rate": 7.799873930687977e-07,
"loss": 0.0033,
"num_tokens": 3818773.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 187.71429443359375,
"epoch": 0.365,
"grad_norm": 6.482025225350517,
"kl": 2.0046875,
"learning_rate": 7.764599916341816e-07,
"loss": 0.003,
"num_tokens": 3843103.0,
"reward": 2.892857313156128,
"reward_std": 0.22987756729125977,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9142857193946838,
"rewards/slop_reward": 0.9785714268684387,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 214.8857208251953,
"epoch": 0.3675,
"grad_norm": 3.8265113738201806,
"kl": 1.3890625,
"learning_rate": 7.729126593423149e-07,
"loss": 0.0033,
"num_tokens": 3868513.0,
"reward": 2.997367763519287,
"reward_std": 0.006964774429798126,
"rewards/classifier_reward": 0.9973675608634949,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 184.1428649902344,
"epoch": 0.37,
"grad_norm": 18.75869388269992,
"kl": 1.515625,
"learning_rate": 7.693456519313719e-07,
"loss": 0.0029,
"num_tokens": 3892878.0,
"reward": 2.851199245452881,
"reward_std": 0.18016420006752015,
"rewards/classifier_reward": 0.9869133591651916,
"rewards/length_reward": 0.8857142865657807,
"rewards/slop_reward": 0.9785714268684387,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 193.6571502685547,
"epoch": 0.3725,
"grad_norm": 5.454996617575025,
"kl": 1.684375,
"learning_rate": 7.657592265579669e-07,
"loss": 0.0031,
"num_tokens": 3917511.0,
"reward": 2.993258571624756,
"reward_std": 0.01783668529242277,
"rewards/classifier_reward": 0.9932583689689636,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 181.31429138183594,
"epoch": 0.375,
"grad_norm": 3.955698987070301,
"kl": 1.3390625,
"learning_rate": 7.621536417786158e-07,
"loss": 0.0032,
"num_tokens": 3941554.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 183.9714385986328,
"epoch": 0.3775,
"grad_norm": 4.5979098970153185,
"kl": 1.528125,
"learning_rate": 7.585291575310952e-07,
"loss": 0.0034,
"num_tokens": 3965818.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 175.71429443359375,
"epoch": 0.38,
"grad_norm": 7.501271856937242,
"kl": 1.790625,
"learning_rate": 7.548860351157027e-07,
"loss": 0.0027,
"num_tokens": 3989746.0,
"reward": 2.7536909580230713,
"reward_std": 0.3092236161231995,
"rewards/classifier_reward": 0.9251193881034852,
"rewards/length_reward": 0.8285714387893677,
"rewards/slop_reward": 1.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 210.571435546875,
"epoch": 0.3825,
"grad_norm": 5.2866175605,
"kl": 2.065625,
"learning_rate": 7.512245371764196e-07,
"loss": 0.0035,
"num_tokens": 4015036.0,
"reward": 2.9245490074157714,
"reward_std": 0.12335940003395081,
"rewards/classifier_reward": 0.9727631211280823,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9517857074737549,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 187.771435546875,
"epoch": 0.385,
"grad_norm": 4.602093297361531,
"kl": 2.203125,
"learning_rate": 7.475449276819752e-07,
"loss": 0.0041,
"num_tokens": 4039528.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 175.60000610351562,
"epoch": 0.3875,
"grad_norm": 6.76968523287945,
"kl": 1.71875,
"learning_rate": 7.438474719068173e-07,
"loss": 0.0031,
"num_tokens": 4063594.0,
"reward": 2.914285898208618,
"reward_std": 0.1731828987598419,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9142857193946838,
"rewards/slop_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 181.31429443359374,
"epoch": 0.39,
"grad_norm": 3.9415238872059795,
"kl": 3.5171875,
"learning_rate": 7.401324364119871e-07,
"loss": 0.0054,
"num_tokens": 4087555.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 199.22858276367188,
"epoch": 0.3925,
"grad_norm": 0.6743528375242909,
"kl": 1.6703125,
"learning_rate": 7.364000890259023e-07,
"loss": 0.0041,
"num_tokens": 4112265.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 203.02857971191406,
"epoch": 0.395,
"grad_norm": 0.21235184389038209,
"kl": 1.5953125,
"learning_rate": 7.326506988250487e-07,
"loss": 0.004,
"num_tokens": 4137291.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 211.00000915527343,
"epoch": 0.3975,
"grad_norm": 0.7871274406295002,
"kl": 1.9828125,
"learning_rate": 7.288845361145812e-07,
"loss": 0.0044,
"num_tokens": 4162596.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 205.82857971191407,
"epoch": 0.4,
"grad_norm": 0.3126714050105658,
"kl": 1.596875,
"learning_rate": 7.251018724088366e-07,
"loss": 0.004,
"num_tokens": 4187048.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 197.42857666015624,
"epoch": 0.4025,
"grad_norm": 0.3330658614195518,
"kl": 1.5875,
"learning_rate": 7.213029804117603e-07,
"loss": 0.004,
"num_tokens": 4211839.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 182.74286499023438,
"epoch": 0.405,
"grad_norm": 0.09053574072038517,
"kl": 1.2125,
"learning_rate": 7.174881339972448e-07,
"loss": 0.0036,
"num_tokens": 4236155.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 189.971435546875,
"epoch": 0.4075,
"grad_norm": 3.922781639240598,
"kl": 1.1859375,
"learning_rate": 7.136576081893863e-07,
"loss": 0.0031,
"num_tokens": 4260724.0,
"reward": 2.968118953704834,
"reward_std": 0.07463454008102417,
"rewards/classifier_reward": 0.9966901540756226,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 211.22857971191405,
"epoch": 0.41,
"grad_norm": 0.5831119895182555,
"kl": 1.22265625,
"learning_rate": 7.09811679142657e-07,
"loss": 0.0036,
"num_tokens": 4285850.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 203.71429748535155,
"epoch": 0.4125,
"grad_norm": 2.3433145018681825,
"kl": 1.1546875,
"learning_rate": 7.059506241219964e-07,
"loss": 0.0031,
"num_tokens": 4310900.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 200.91429443359374,
"epoch": 0.415,
"grad_norm": 0.18814570159680577,
"kl": 1.1671875,
"learning_rate": 7.02074721482822e-07,
"loss": 0.0036,
"num_tokens": 4335852.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 212.08572692871093,
"epoch": 0.4175,
"grad_norm": 3.9253886023797246,
"kl": 1.08125,
"learning_rate": 6.981842506509626e-07,
"loss": 0.0025,
"num_tokens": 4361111.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 202.82857971191407,
"epoch": 0.42,
"grad_norm": 118.43288868650292,
"kl": 36.20390625,
"learning_rate": 6.942794921025126e-07,
"loss": 0.0382,
"num_tokens": 4386130.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 194.0571533203125,
"epoch": 0.4225,
"grad_norm": 0.07954465329541446,
"kl": 1.03984375,
"learning_rate": 6.903607273436127e-07,
"loss": 0.0034,
"num_tokens": 4410840.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 193.7714385986328,
"epoch": 0.425,
"grad_norm": 0.0486408492542655,
"kl": 0.978125,
"learning_rate": 6.864282388901543e-07,
"loss": 0.0034,
"num_tokens": 4435370.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 208.08572692871093,
"epoch": 0.4275,
"grad_norm": 2.6728463524577752,
"kl": 0.92265625,
"learning_rate": 6.824823102474126e-07,
"loss": 0.0028,
"num_tokens": 4460308.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 202.22858276367188,
"epoch": 0.43,
"grad_norm": 0.13449796435482986,
"kl": 1.034375,
"learning_rate": 6.785232258896076e-07,
"loss": 0.0034,
"num_tokens": 4485226.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 182.68572082519532,
"epoch": 0.4325,
"grad_norm": 7.189835012946043,
"kl": 0.95078125,
"learning_rate": 6.745512712393957e-07,
"loss": 0.0024,
"num_tokens": 4509446.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 192.20000915527345,
"epoch": 0.435,
"grad_norm": 3.314581838568204,
"kl": 0.9203125,
"learning_rate": 6.705667326472924e-07,
"loss": 0.0028,
"num_tokens": 4533638.0,
"reward": 2.992787170410156,
"reward_std": 0.019083873927593233,
"rewards/classifier_reward": 0.992786979675293,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 181.7714416503906,
"epoch": 0.4375,
"grad_norm": 4.868845585454781,
"kl": 0.91015625,
"learning_rate": 6.665698973710288e-07,
"loss": 0.0023,
"num_tokens": 4557920.0,
"reward": 2.914285898208618,
"reward_std": 0.1731828987598419,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9142857193946838,
"rewards/slop_reward": 1.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 202.4857208251953,
"epoch": 0.44,
"grad_norm": 5.277192051669204,
"kl": 0.9015625,
"learning_rate": 6.625610535548417e-07,
"loss": 0.0028,
"num_tokens": 4582927.0,
"reward": 2.971197080612183,
"reward_std": 0.07549313902854919,
"rewards/classifier_reward": 0.9997682809829712,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 191.3714385986328,
"epoch": 0.4425,
"grad_norm": 3.5895293665237213,
"kl": 0.86953125,
"learning_rate": 6.58540490208701e-07,
"loss": 0.0028,
"num_tokens": 4607545.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 204.74286804199218,
"epoch": 0.445,
"grad_norm": 3.6924099151163348,
"kl": 0.88046875,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0023,
"num_tokens": 4632631.0,
"reward": 2.9700303077697754,
"reward_std": 0.07929292395710945,
"rewards/classifier_reward": 0.9986015200614929,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 210.74286804199218,
"epoch": 0.4475,
"grad_norm": 4.558365673779022,
"kl": 1.03125,
"learning_rate": 6.504653651700277e-07,
"loss": 0.0025,
"num_tokens": 4657813.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 209.17143859863282,
"epoch": 0.45,
"grad_norm": 0.04420464498654459,
"kl": 0.86640625,
"learning_rate": 6.464113856382751e-07,
"loss": 0.0033,
"num_tokens": 4683054.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 218.4857208251953,
"epoch": 0.4525,
"grad_norm": 2.557396683384517,
"kl": 1.0640625,
"learning_rate": 6.423468508561598e-07,
"loss": 0.003,
"num_tokens": 4708257.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 229.88572387695314,
"epoch": 0.455,
"grad_norm": 3.1120823972576854,
"kl": 1.415625,
"learning_rate": 6.382720538485855e-07,
"loss": 0.0033,
"num_tokens": 4734223.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 226.40000915527344,
"epoch": 0.4575,
"grad_norm": 0.06880695435363977,
"kl": 0.846875,
"learning_rate": 6.341872883802922e-07,
"loss": 0.0032,
"num_tokens": 4759812.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 235.05715637207032,
"epoch": 0.46,
"grad_norm": 2.2167569033565937,
"kl": 0.75546875,
"learning_rate": 6.300928489346765e-07,
"loss": 0.0027,
"num_tokens": 4785935.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 227.91429443359374,
"epoch": 0.4625,
"grad_norm": 2.5442698264193897,
"kl": 0.7765625,
"learning_rate": 6.259890306925626e-07,
"loss": 0.0027,
"num_tokens": 4811832.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 221.2571533203125,
"epoch": 0.465,
"grad_norm": 4.529163471508088,
"kl": 0.79765625,
"learning_rate": 6.218761295109208e-07,
"loss": 0.0018,
"num_tokens": 4837398.0,
"reward": 2.9214287281036375,
"reward_std": 0.2078804552555084,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 0.9785714268684387,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 218.40000915527344,
"epoch": 0.4675,
"grad_norm": 0.04502419697939018,
"kl": 0.8265625,
"learning_rate": 6.177544419015387e-07,
"loss": 0.0032,
"num_tokens": 4862776.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 224.85715026855468,
"epoch": 0.47,
"grad_norm": 2.7870230542981322,
"kl": 0.825,
"learning_rate": 6.13624265009645e-07,
"loss": 0.0027,
"num_tokens": 4888566.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 228.0571502685547,
"epoch": 0.4725,
"grad_norm": 0.06424356320573535,
"kl": 0.8890625,
"learning_rate": 6.094858965924866e-07,
"loss": 0.0033,
"num_tokens": 4914374.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 224.4571533203125,
"epoch": 0.475,
"grad_norm": 4.417906183651794,
"kl": 0.90703125,
"learning_rate": 6.053396349978631e-07,
"loss": 0.0023,
"num_tokens": 4940085.0,
"reward": 2.8559008598327638,
"reward_std": 0.20502071976661682,
"rewards/classifier_reward": 0.9344720721244812,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 0.9785714268684387,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 228.57143859863282,
"epoch": 0.4775,
"grad_norm": 0.7826472563807869,
"kl": 0.95703125,
"learning_rate": 6.011857791426178e-07,
"loss": 0.0033,
"num_tokens": 4966005.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 228.171435546875,
"epoch": 0.48,
"grad_norm": 0.2991397618360992,
"kl": 1.06015625,
"learning_rate": 5.970246284910876e-07,
"loss": 0.0034,
"num_tokens": 4991803.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 245.6285827636719,
"epoch": 0.4825,
"grad_norm": 4.590555458209977,
"kl": 0.915625,
"learning_rate": 5.92856483033514e-07,
"loss": 0.0023,
"num_tokens": 5018320.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 231.00001220703126,
"epoch": 0.485,
"grad_norm": 3.8249513457346622,
"kl": 0.83125,
"learning_rate": 5.886816432644154e-07,
"loss": 0.0023,
"num_tokens": 5044075.0,
"reward": 2.9297013759613035,
"reward_std": 0.12975128293037413,
"rewards/classifier_reward": 0.9797011494636536,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 232.00000915527343,
"epoch": 0.4875,
"grad_norm": 3.4135057358827834,
"kl": 1.3140625,
"learning_rate": 5.845004101609246e-07,
"loss": 0.0032,
"num_tokens": 5069796.0,
"reward": 2.9581347465515138,
"reward_std": 0.10335763692855834,
"rewards/classifier_reward": 0.9867059469223023,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 236.68572692871095,
"epoch": 0.49,
"grad_norm": 4.013642506936478,
"kl": 0.85390625,
"learning_rate": 5.803130851610885e-07,
"loss": 0.0023,
"num_tokens": 5095958.0,
"reward": 2.9428573131561278,
"reward_std": 0.15118578672409058,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 238.4571533203125,
"epoch": 0.4925,
"grad_norm": 4.603644905524406,
"kl": 0.940625,
"learning_rate": 5.761199701421391e-07,
"loss": 0.0019,
"num_tokens": 5121931.0,
"reward": 2.941946840286255,
"reward_std": 0.15359463561326264,
"rewards/classifier_reward": 0.9990895390510559,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 238.02857971191406,
"epoch": 0.495,
"grad_norm": 2.4430881235820507,
"kl": 0.88046875,
"learning_rate": 5.719213673987276e-07,
"loss": 0.0028,
"num_tokens": 5148140.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 228.7714385986328,
"epoch": 0.4975,
"grad_norm": 2.885447633946335,
"kl": 0.8984375,
"learning_rate": 5.677175796211332e-07,
"loss": 0.0028,
"num_tokens": 5173797.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 234.77144470214844,
"epoch": 0.5,
"grad_norm": 0.04186680424978939,
"kl": 0.9296875,
"learning_rate": 5.635089098734393e-07,
"loss": 0.0033,
"num_tokens": 5199798.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 224.51429443359376,
"epoch": 0.5025,
"grad_norm": 2.794225657350946,
"kl": 0.95,
"learning_rate": 5.592956615716866e-07,
"loss": 0.0029,
"num_tokens": 5225576.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 216.02857971191406,
"epoch": 0.505,
"grad_norm": 4.294508436959706,
"kl": 1.10625,
"learning_rate": 5.550781384619973e-07,
"loss": 0.0025,
"num_tokens": 5251038.0,
"reward": 2.975967788696289,
"reward_std": 0.0635837346315384,
"rewards/classifier_reward": 0.9759676098823548,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 218.571435546875,
"epoch": 0.5075,
"grad_norm": 0.043507163632059365,
"kl": 0.91484375,
"learning_rate": 5.50856644598678e-07,
"loss": 0.0033,
"num_tokens": 5276396.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 205.6571502685547,
"epoch": 0.51,
"grad_norm": 0.09961196396275367,
"kl": 1.0578125,
"learning_rate": 5.466314843222993e-07,
"loss": 0.0034,
"num_tokens": 5301460.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 225.9714385986328,
"epoch": 0.5125,
"grad_norm": 3.605683444813084,
"kl": 0.9609375,
"learning_rate": 5.424029622377546e-07,
"loss": 0.0029,
"num_tokens": 5327289.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 215.80001220703124,
"epoch": 0.515,
"grad_norm": 0.19828091837058953,
"kl": 1.23828125,
"learning_rate": 5.381713831923007e-07,
"loss": 0.0036,
"num_tokens": 5352596.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 214.8571563720703,
"epoch": 0.5175,
"grad_norm": 2.9547607246974565,
"kl": 1.12890625,
"learning_rate": 5.339370522535804e-07,
"loss": 0.003,
"num_tokens": 5377938.0,
"reward": 2.992868709564209,
"reward_std": 0.018868234753608704,
"rewards/classifier_reward": 0.9928684830665588,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 252.00001525878906,
"epoch": 0.52,
"grad_norm": 871.3268746641542,
"kl": 64.790625,
"learning_rate": 5.297002746876284e-07,
"loss": 0.0667,
"num_tokens": 5404678.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 271.0857269287109,
"epoch": 0.5225,
"grad_norm": 13961.922953195786,
"kl": 6094.1890625,
"learning_rate": 5.254613559368648e-07,
"loss": 6.1111,
"num_tokens": 5432086.0,
"reward": 2.8857144832611086,
"reward_std": 0.18249738812446595,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 257.9714385986328,
"epoch": 0.525,
"grad_norm": 8.061516897407943,
"kl": 1.028125,
"learning_rate": 5.212206015980741e-07,
"loss": 0.0025,
"num_tokens": 5459016.0,
"reward": 2.828571605682373,
"reward_std": 0.1731828987598419,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8285714328289032,
"rewards/slop_reward": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 234.3714385986328,
"epoch": 0.5275,
"grad_norm": 0.05294394256827684,
"kl": 1.0125,
"learning_rate": 5.169783174003744e-07,
"loss": 0.0034,
"num_tokens": 5484886.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 244.20001220703125,
"epoch": 0.53,
"grad_norm": 2.7195604219419645,
"kl": 1.353125,
"learning_rate": 5.127348091831755e-07,
"loss": 0.0033,
"num_tokens": 5511353.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 247.80001220703124,
"epoch": 0.5325,
"grad_norm": 2.435046990313359,
"kl": 0.98671875,
"learning_rate": 5.084903828741312e-07,
"loss": 0.0029,
"num_tokens": 5537879.0,
"reward": 2.828571653366089,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8285714298486709,
"rewards/slop_reward": 1.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 233.82858276367188,
"epoch": 0.535,
"grad_norm": 2.848539339582319,
"kl": 1.053125,
"learning_rate": 5.042453444670828e-07,
"loss": 0.003,
"num_tokens": 5563937.0,
"reward": 2.9500002384185793,
"reward_std": 0.0866025447845459,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 243.6285827636719,
"epoch": 0.5375,
"grad_norm": 4.574390608580604,
"kl": 0.91953125,
"learning_rate": 5e-07,
"loss": 0.0024,
"num_tokens": 5590384.0,
"reward": 2.8857144832611086,
"reward_std": 0.19518001079559327,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 284.0571563720703,
"epoch": 0.54,
"grad_norm": 5.393964971542159,
"kl": 1.16484375,
"learning_rate": 4.957546555329173e-07,
"loss": 0.0016,
"num_tokens": 5618238.0,
"reward": 2.6285715103149414,
"reward_std": 0.25809029340744016,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.6285714328289032,
"rewards/slop_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 244.40001525878907,
"epoch": 0.5425,
"grad_norm": 3.809477712898502,
"kl": 0.7828125,
"learning_rate": 4.915096171258689e-07,
"loss": 0.0022,
"num_tokens": 5644712.0,
"reward": 2.828571605682373,
"reward_std": 0.17318291068077088,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8285714328289032,
"rewards/slop_reward": 1.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 227.60001220703126,
"epoch": 0.545,
"grad_norm": 0.7163836490261896,
"kl": 1.0765625,
"learning_rate": 4.872651908168244e-07,
"loss": 0.0035,
"num_tokens": 5670466.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 210.7714416503906,
"epoch": 0.5475,
"grad_norm": 0.06375443345853048,
"kl": 0.86953125,
"learning_rate": 4.830216825996256e-07,
"loss": 0.0033,
"num_tokens": 5695540.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 212.02858276367186,
"epoch": 0.55,
"grad_norm": 0.044596037608717574,
"kl": 0.8765625,
"learning_rate": 4.787793984019259e-07,
"loss": 0.0033,
"num_tokens": 5720881.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 200.80000915527344,
"epoch": 0.5525,
"grad_norm": 0.16974934473523362,
"kl": 1.16640625,
"learning_rate": 4.7453864406313536e-07,
"loss": 0.0036,
"num_tokens": 5745792.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 211.22857971191405,
"epoch": 0.555,
"grad_norm": 0.0818904708032968,
"kl": 0.9921875,
"learning_rate": 4.7029972531237154e-07,
"loss": 0.0034,
"num_tokens": 5770873.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 195.2571533203125,
"epoch": 0.5575,
"grad_norm": 0.05116820241495938,
"kl": 0.9,
"learning_rate": 4.6606294774641965e-07,
"loss": 0.0033,
"num_tokens": 5795571.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 190.57143859863282,
"epoch": 0.56,
"grad_norm": 3.069339312788395,
"kl": 0.9046875,
"learning_rate": 4.6182861680769923e-07,
"loss": 0.0028,
"num_tokens": 5819962.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 195.08572387695312,
"epoch": 0.5625,
"grad_norm": 2.051249437321434,
"kl": 0.890625,
"learning_rate": 4.5759703776224555e-07,
"loss": 0.0028,
"num_tokens": 5844710.0,
"reward": 2.954781198501587,
"reward_std": 0.0808977723121643,
"rewards/classifier_reward": 0.9833523750305175,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 194.11429748535156,
"epoch": 0.565,
"grad_norm": 0.06947745807380493,
"kl": 0.915625,
"learning_rate": 4.5336851567770074e-07,
"loss": 0.0033,
"num_tokens": 5869322.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 196.68572082519532,
"epoch": 0.5675,
"grad_norm": 3.6616246314488534,
"kl": 0.75234375,
"learning_rate": 4.4914335540132204e-07,
"loss": 0.0027,
"num_tokens": 5893903.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 190.91429443359374,
"epoch": 0.57,
"grad_norm": 2.9722209992367854,
"kl": 2.09140625,
"learning_rate": 4.4492186153800284e-07,
"loss": 0.004,
"num_tokens": 5918505.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 184.91429138183594,
"epoch": 0.5725,
"grad_norm": 0.06042948982632837,
"kl": 0.8359375,
"learning_rate": 4.407043384283136e-07,
"loss": 0.0032,
"num_tokens": 5942897.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 182.25715026855468,
"epoch": 0.575,
"grad_norm": 0.07756768631242807,
"kl": 0.8734375,
"learning_rate": 4.364910901265606e-07,
"loss": 0.0033,
"num_tokens": 5967196.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 173.02857971191406,
"epoch": 0.5775,
"grad_norm": 0.9892808129257086,
"kl": 1.546875,
"learning_rate": 4.3228242037886687e-07,
"loss": 0.0039,
"num_tokens": 5990830.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 162.94286499023437,
"epoch": 0.58,
"grad_norm": 0.0578879268472676,
"kl": 0.97578125,
"learning_rate": 4.280786326012723e-07,
"loss": 0.0034,
"num_tokens": 6014434.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 165.42857666015624,
"epoch": 0.5825,
"grad_norm": 3.3348841449857387,
"kl": 1.1046875,
"learning_rate": 4.23880029857861e-07,
"loss": 0.003,
"num_tokens": 6038089.0,
"reward": 2.914285945892334,
"reward_std": 0.10690449476242066,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9142857193946838,
"rewards/slop_reward": 1.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 163.71429138183595,
"epoch": 0.585,
"grad_norm": 3.485368879973419,
"kl": 1.05703125,
"learning_rate": 4.1968691483891133e-07,
"loss": 0.003,
"num_tokens": 6061739.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 163.00000915527343,
"epoch": 0.5875,
"grad_norm": 5.36230481850721,
"kl": 1.00703125,
"learning_rate": 4.154995898390755e-07,
"loss": 0.002,
"num_tokens": 6085364.0,
"reward": 2.9397803783416747,
"reward_std": 0.15932661443948745,
"rewards/classifier_reward": 0.9969230651855469,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 177.0571502685547,
"epoch": 0.59,
"grad_norm": 3.3342971599613382,
"kl": 1.04140625,
"learning_rate": 4.1131835673558456e-07,
"loss": 0.003,
"num_tokens": 6109257.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 191.42857971191407,
"epoch": 0.5925,
"grad_norm": 0.08376084072008337,
"kl": 1.02109375,
"learning_rate": 4.0714351696648606e-07,
"loss": 0.0034,
"num_tokens": 6133846.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 192.08572692871093,
"epoch": 0.595,
"grad_norm": 0.046627278932137715,
"kl": 0.98515625,
"learning_rate": 4.029753715089123e-07,
"loss": 0.0034,
"num_tokens": 6158489.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 198.08572387695312,
"epoch": 0.5975,
"grad_norm": 11.565806887201926,
"kl": 13.7140625,
"learning_rate": 3.988142208573822e-07,
"loss": 0.0161,
"num_tokens": 6183159.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 197.31429443359374,
"epoch": 0.6,
"grad_norm": 0.05311856298193549,
"kl": 1.02265625,
"learning_rate": 3.94660365002137e-07,
"loss": 0.0034,
"num_tokens": 6207985.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 202.25715026855468,
"epoch": 0.6025,
"grad_norm": 4.782436637657736,
"kl": 1.04921875,
"learning_rate": 3.9051410340751346e-07,
"loss": 0.0025,
"num_tokens": 6232984.0,
"reward": 2.8857144832611086,
"reward_std": 0.1824974000453949,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 193.6285827636719,
"epoch": 0.605,
"grad_norm": 3.232063907518827,
"kl": 1.15859375,
"learning_rate": 3.8637573499035503e-07,
"loss": 0.0031,
"num_tokens": 6257629.0,
"reward": 2.879447841644287,
"reward_std": 0.11382801532745361,
"rewards/classifier_reward": 0.9937332987785339,
"rewards/length_reward": 0.8857142865657807,
"rewards/slop_reward": 1.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 195.4571502685547,
"epoch": 0.6075,
"grad_norm": 0.06471530202926354,
"kl": 1.05625,
"learning_rate": 3.822455580984613e-07,
"loss": 0.0034,
"num_tokens": 6282207.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 274.0571594238281,
"epoch": 0.61,
"grad_norm": 3.0795183504620107,
"kl": 1.50390625,
"learning_rate": 3.781238704890792e-07,
"loss": 0.0034,
"num_tokens": 6309000.0,
"reward": 2.783582401275635,
"reward_std": 0.023816290497779845,
"rewards/classifier_reward": 0.9835822105407714,
"rewards/length_reward": 0.8,
"rewards/slop_reward": 1.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 210.28572692871094,
"epoch": 0.6125,
"grad_norm": 3.0694884073186777,
"kl": 1.0015625,
"learning_rate": 3.7401096930743746e-07,
"loss": 0.0029,
"num_tokens": 6334093.0,
"reward": 2.9978450298309327,
"reward_std": 0.005702095478773117,
"rewards/classifier_reward": 0.9978448033332825,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 212.02857971191406,
"epoch": 0.615,
"grad_norm": 2.5150561307413764,
"kl": 1.084375,
"learning_rate": 3.699071510653235e-07,
"loss": 0.003,
"num_tokens": 6359434.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 220.0571533203125,
"epoch": 0.6175,
"grad_norm": 0.04173992686860701,
"kl": 0.96875,
"learning_rate": 3.6581271161970784e-07,
"loss": 0.0034,
"num_tokens": 6384975.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 221.91429443359374,
"epoch": 0.62,
"grad_norm": 0.11609659945057246,
"kl": 1.05078125,
"learning_rate": 3.6172794615141446e-07,
"loss": 0.0034,
"num_tokens": 6410642.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 229.8571533203125,
"epoch": 0.6225,
"grad_norm": 0.08717380831309861,
"kl": 0.971875,
"learning_rate": 3.5765314914384024e-07,
"loss": 0.0034,
"num_tokens": 6436607.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 235.22858276367188,
"epoch": 0.625,
"grad_norm": 0.06146613518625495,
"kl": 0.99765625,
"learning_rate": 3.535886143617248e-07,
"loss": 0.0034,
"num_tokens": 6462760.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 238.17144165039062,
"epoch": 0.6275,
"grad_norm": 3.705436703551293,
"kl": 1.01953125,
"learning_rate": 3.495346348299724e-07,
"loss": 0.0025,
"num_tokens": 6488563.0,
"reward": 2.9139774799346925,
"reward_std": 0.1073996058665216,
"rewards/classifier_reward": 0.9996915578842163,
"rewards/length_reward": 0.9142857193946838,
"rewards/slop_reward": 1.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 240.9714385986328,
"epoch": 0.63,
"grad_norm": 0.04691998567992384,
"kl": 0.97890625,
"learning_rate": 3.454915028125263e-07,
"loss": 0.0034,
"num_tokens": 6514734.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 228.02857971191406,
"epoch": 0.6325,
"grad_norm": 2.988060627858761,
"kl": 1.0296875,
"learning_rate": 3.4145950979129914e-07,
"loss": 0.0029,
"num_tokens": 6540498.0,
"reward": 2.99902081489563,
"reward_std": 0.0025911811739206315,
"rewards/classifier_reward": 0.9990206360816956,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 218.54286499023436,
"epoch": 0.635,
"grad_norm": 0.09098624238326748,
"kl": 1.11640625,
"learning_rate": 3.3743894644515824e-07,
"loss": 0.0035,
"num_tokens": 6565616.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 223.57143859863282,
"epoch": 0.6375,
"grad_norm": 0.07923298438659351,
"kl": 0.95703125,
"learning_rate": 3.334301026289712e-07,
"loss": 0.0033,
"num_tokens": 6591361.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 219.00000610351563,
"epoch": 0.64,
"grad_norm": 3.44595488232409,
"kl": 1.071875,
"learning_rate": 3.294332673527076e-07,
"loss": 0.003,
"num_tokens": 6616850.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 220.80000915527344,
"epoch": 0.6425,
"grad_norm": 0.0704184343563711,
"kl": 1.046875,
"learning_rate": 3.254487287606044e-07,
"loss": 0.0034,
"num_tokens": 6642498.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 235.11429748535156,
"epoch": 0.645,
"grad_norm": 4.4341498316331425,
"kl": 0.9890625,
"learning_rate": 3.214767741103923e-07,
"loss": 0.0024,
"num_tokens": 6668511.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 227.11429443359376,
"epoch": 0.6475,
"grad_norm": 2.667970216501767,
"kl": 1.121875,
"learning_rate": 3.1751768975258743e-07,
"loss": 0.003,
"num_tokens": 6694380.0,
"reward": 2.9997310638427734,
"reward_std": 0.0007120789494365453,
"rewards/classifier_reward": 0.9997308611869812,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 224.42858276367187,
"epoch": 0.65,
"grad_norm": 0.049266434577523735,
"kl": 1.04921875,
"learning_rate": 3.135717611098457e-07,
"loss": 0.0034,
"num_tokens": 6719910.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 220.97144165039063,
"epoch": 0.6525,
"grad_norm": 0.046460428834944396,
"kl": 1.00546875,
"learning_rate": 3.0963927265638734e-07,
"loss": 0.0034,
"num_tokens": 6745328.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 229.2571533203125,
"epoch": 0.655,
"grad_norm": 2.6813194889359324,
"kl": 1.08125,
"learning_rate": 3.0572050789748726e-07,
"loss": 0.003,
"num_tokens": 6771231.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 217.20001220703125,
"epoch": 0.6575,
"grad_norm": 2.8890479947848235,
"kl": 1.16796875,
"learning_rate": 3.018157493490374e-07,
"loss": 0.0031,
"num_tokens": 6796753.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 233.51430053710936,
"epoch": 0.66,
"grad_norm": 2.802128759307218,
"kl": 1.371875,
"learning_rate": 2.9792527851717803e-07,
"loss": 0.0033,
"num_tokens": 6822476.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 225.82858581542968,
"epoch": 0.6625,
"grad_norm": 0.06018626969663984,
"kl": 1.04296875,
"learning_rate": 2.940493758780037e-07,
"loss": 0.0034,
"num_tokens": 6847958.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 214.571435546875,
"epoch": 0.665,
"grad_norm": 57.99972496539001,
"kl": 2.54609375,
"learning_rate": 2.9018832085734295e-07,
"loss": 0.0045,
"num_tokens": 6873054.0,
"reward": 2.8571430683135985,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8571428596973419,
"rewards/slop_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 222.40001220703124,
"epoch": 0.6675,
"grad_norm": 5.391908835054465,
"kl": 1.19921875,
"learning_rate": 2.863423918106138e-07,
"loss": 0.0022,
"num_tokens": 6898757.0,
"reward": 2.9923308849334718,
"reward_std": 0.02029096046462655,
"rewards/classifier_reward": 0.9923307299613953,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 226.80001220703124,
"epoch": 0.67,
"grad_norm": 0.44063981643559547,
"kl": 1.27265625,
"learning_rate": 2.825118660027553e-07,
"loss": 0.0037,
"num_tokens": 6924550.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 228.80001220703124,
"epoch": 0.6725,
"grad_norm": 3.710195212366325,
"kl": 1.1890625,
"learning_rate": 2.786970195882398e-07,
"loss": 0.0026,
"num_tokens": 6950478.0,
"reward": 2.9779660224914553,
"reward_std": 0.058296956680715085,
"rewards/classifier_reward": 0.9993943929672241,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 227.51430053710936,
"epoch": 0.675,
"grad_norm": 99.1686641289829,
"kl": 79.6328125,
"learning_rate": 2.748981275911633e-07,
"loss": 0.0819,
"num_tokens": 6976266.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 242.82857971191407,
"epoch": 0.6775,
"grad_norm": 4.778004765539873,
"kl": 1.34140625,
"learning_rate": 2.7111546388541896e-07,
"loss": 0.0028,
"num_tokens": 7002514.0,
"reward": 2.9389439105987547,
"reward_std": 0.10570754185318947,
"rewards/classifier_reward": 0.996086585521698,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 255.74286499023438,
"epoch": 0.68,
"grad_norm": 6.749318308021253,
"kl": 1.54609375,
"learning_rate": 2.673493011749513e-07,
"loss": 0.003,
"num_tokens": 7029221.0,
"reward": 2.6523685693740844,
"reward_std": 0.242851722240448,
"rewards/classifier_reward": 0.9095112562179566,
"rewards/length_reward": 0.7428571462631226,
"rewards/slop_reward": 1.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 238.02857971191406,
"epoch": 0.6825,
"grad_norm": 4.816456021644019,
"kl": 1.69375,
"learning_rate": 2.635999109740976e-07,
"loss": 0.0027,
"num_tokens": 7055228.0,
"reward": 2.858464765548706,
"reward_std": 0.17900042831897736,
"rewards/classifier_reward": 0.9941788673400879,
"rewards/length_reward": 0.8857142865657807,
"rewards/slop_reward": 0.9785714268684387,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 241.7714385986328,
"epoch": 0.685,
"grad_norm": 2.4842492940641647,
"kl": 1.0171875,
"learning_rate": 2.598675635880129e-07,
"loss": 0.0029,
"num_tokens": 7081610.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 228.571435546875,
"epoch": 0.6875,
"grad_norm": 2.556011928350466,
"kl": 1.15546875,
"learning_rate": 2.561525280931828e-07,
"loss": 0.0031,
"num_tokens": 7107408.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 231.6285827636719,
"epoch": 0.69,
"grad_norm": 0.08355624448786589,
"kl": 1.053125,
"learning_rate": 2.5245507231802486e-07,
"loss": 0.0034,
"num_tokens": 7133271.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 235.5428680419922,
"epoch": 0.6925,
"grad_norm": 3.2025322276348107,
"kl": 1.4359375,
"learning_rate": 2.487754628235805e-07,
"loss": 0.0033,
"num_tokens": 7159353.0,
"reward": 2.9962107658386232,
"reward_std": 0.01002594456076622,
"rewards/classifier_reward": 0.9962105512619018,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 228.65715942382812,
"epoch": 0.695,
"grad_norm": 0.18581262102933002,
"kl": 1.25859375,
"learning_rate": 2.4511396488429724e-07,
"loss": 0.0036,
"num_tokens": 7185072.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 229.51429748535156,
"epoch": 0.6975,
"grad_norm": 4.453521381818444,
"kl": 0.99765625,
"learning_rate": 2.414708424689048e-07,
"loss": 0.0024,
"num_tokens": 7210683.0,
"reward": 2.882569408416748,
"reward_std": 0.2035010576248169,
"rewards/classifier_reward": 0.9397120952606202,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 223.31429443359374,
"epoch": 0.7,
"grad_norm": 0.1121570381593698,
"kl": 1.2296875,
"learning_rate": 2.378463582213842e-07,
"loss": 0.0036,
"num_tokens": 7236399.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 217.60000915527343,
"epoch": 0.7025,
"grad_norm": 4.495034027794915,
"kl": 0.9515625,
"learning_rate": 2.3424077344203307e-07,
"loss": 0.0024,
"num_tokens": 7261935.0,
"reward": 2.99290018081665,
"reward_std": 0.018784815073013307,
"rewards/classifier_reward": 0.9929000020027161,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 228.11429748535156,
"epoch": 0.705,
"grad_norm": 0.5186142582429601,
"kl": 1.41875,
"learning_rate": 2.3065434806862805e-07,
"loss": 0.0038,
"num_tokens": 7287768.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 228.11429443359376,
"epoch": 0.7075,
"grad_norm": 2.654334342756894,
"kl": 1.0203125,
"learning_rate": 2.2708734065768486e-07,
"loss": 0.0029,
"num_tokens": 7312659.0,
"reward": 2.9993388175964357,
"reward_std": 0.0017499331384897231,
"rewards/classifier_reward": 0.9993385910987854,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 234.40001220703124,
"epoch": 0.71,
"grad_norm": 4.588949960274673,
"kl": 1.12421875,
"learning_rate": 2.2354000836581831e-07,
"loss": 0.0021,
"num_tokens": 7338617.0,
"reward": 2.9408255100250242,
"reward_std": 0.1565615115687251,
"rewards/classifier_reward": 0.9908253073692321,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 231.6285827636719,
"epoch": 0.7125,
"grad_norm": 0.21793328593853228,
"kl": 1.21484375,
"learning_rate": 2.2001260693120232e-07,
"loss": 0.0036,
"num_tokens": 7364198.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 240.08572998046876,
"epoch": 0.715,
"grad_norm": 2.616590063333536,
"kl": 1.01640625,
"learning_rate": 2.1650539065513412e-07,
"loss": 0.0029,
"num_tokens": 7390479.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 241.97144165039063,
"epoch": 0.7175,
"grad_norm": 4.531348675554103,
"kl": 1.19375,
"learning_rate": 2.1301861238370016e-07,
"loss": 0.0031,
"num_tokens": 7416732.0,
"reward": 2.942857360839844,
"reward_std": 0.09759000539779664,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9428571462631226,
"rewards/slop_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 246.20001220703125,
"epoch": 0.72,
"grad_norm": 4.718898706696491,
"kl": 1.49921875,
"learning_rate": 2.0955252348954805e-07,
"loss": 0.0034,
"num_tokens": 7443268.0,
"reward": 2.6439733505249023,
"reward_std": 0.043535226583480836,
"rewards/classifier_reward": 0.8439731419086456,
"rewards/length_reward": 0.8,
"rewards/slop_reward": 1.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 224.48572387695313,
"epoch": 0.7225,
"grad_norm": 0.04294906761504897,
"kl": 1.00546875,
"learning_rate": 2.0610737385376348e-07,
"loss": 0.0034,
"num_tokens": 7468990.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 235.77144470214844,
"epoch": 0.725,
"grad_norm": 0.13090449233770926,
"kl": 1.19453125,
"learning_rate": 2.026834118478567e-07,
"loss": 0.0036,
"num_tokens": 7495162.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 222.57143859863282,
"epoch": 0.7275,
"grad_norm": 0.04429936575325668,
"kl": 0.97734375,
"learning_rate": 1.9928088431585589e-07,
"loss": 0.0034,
"num_tokens": 7520868.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 238.7428741455078,
"epoch": 0.73,
"grad_norm": 12.126930606187928,
"kl": 10.5515625,
"learning_rate": 1.959000365565119e-07,
"loss": 0.0129,
"num_tokens": 7546905.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 231.20001220703125,
"epoch": 0.7325,
"grad_norm": 1.1227825049824671,
"kl": 1.8796875,
"learning_rate": 1.925411123056128e-07,
"loss": 0.0043,
"num_tokens": 7572334.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 234.68572387695312,
"epoch": 0.735,
"grad_norm": 2.907894335014808,
"kl": 1.3328125,
"learning_rate": 1.8920435371841392e-07,
"loss": 0.0032,
"num_tokens": 7598444.0,
"reward": 2.996587371826172,
"reward_std": 0.009029625356197358,
"rewards/classifier_reward": 0.9965871214866638,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 232.68572387695312,
"epoch": 0.7375,
"grad_norm": 0.10333954670768886,
"kl": 1.1203125,
"learning_rate": 1.858900013521788e-07,
"loss": 0.0035,
"num_tokens": 7624449.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 233.54286499023436,
"epoch": 0.74,
"grad_norm": 0.058548069484880644,
"kl": 1.05859375,
"learning_rate": 1.8259829414883725e-07,
"loss": 0.0034,
"num_tokens": 7650523.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 235.9428680419922,
"epoch": 0.7425,
"grad_norm": 3.1888591778113993,
"kl": 1.009375,
"learning_rate": 1.7932946941775878e-07,
"loss": 0.0029,
"num_tokens": 7676533.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 235.5428680419922,
"epoch": 0.745,
"grad_norm": 0.10062491596572344,
"kl": 1.0046875,
"learning_rate": 1.7608376281864502e-07,
"loss": 0.0034,
"num_tokens": 7702619.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 244.51430053710936,
"epoch": 0.7475,
"grad_norm": 0.061386216937859915,
"kl": 1.02578125,
"learning_rate": 1.7286140834453954e-07,
"loss": 0.0034,
"num_tokens": 7729097.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 254.4571533203125,
"epoch": 0.75,
"grad_norm": 0.05751560507113229,
"kl": 1.03984375,
"learning_rate": 1.6966263830495935e-07,
"loss": 0.0034,
"num_tokens": 7755641.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 240.34287109375,
"epoch": 0.7525,
"grad_norm": 0.11118968278237154,
"kl": 1.11484375,
"learning_rate": 1.6648768330914576e-07,
"loss": 0.0035,
"num_tokens": 7781895.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 241.40000915527344,
"epoch": 0.755,
"grad_norm": 0.191477055644317,
"kl": 1.0296875,
"learning_rate": 1.6333677224944037e-07,
"loss": 0.0034,
"num_tokens": 7808096.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 256.8571533203125,
"epoch": 0.7575,
"grad_norm": 2.9503904549206177,
"kl": 0.984375,
"learning_rate": 1.6021013228478275e-07,
"loss": 0.0029,
"num_tokens": 7835006.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 246.02857971191406,
"epoch": 0.76,
"grad_norm": 7.0345652170290975,
"kl": 1.221875,
"learning_rate": 1.5710798882433428e-07,
"loss": 0.0036,
"num_tokens": 7861536.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 253.68572387695312,
"epoch": 0.7625,
"grad_norm": 0.0476505587038927,
"kl": 0.96328125,
"learning_rate": 1.5403056551122694e-07,
"loss": 0.0033,
"num_tokens": 7888255.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 247.88572387695314,
"epoch": 0.765,
"grad_norm": 0.04204135010045963,
"kl": 0.89609375,
"learning_rate": 1.5097808420644115e-07,
"loss": 0.0033,
"num_tokens": 7914639.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 255.1428649902344,
"epoch": 0.7675,
"grad_norm": 0.07577358460759685,
"kl": 0.9578125,
"learning_rate": 1.479507649728105e-07,
"loss": 0.0033,
"num_tokens": 7941362.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 249.9714385986328,
"epoch": 0.77,
"grad_norm": 0.04375804621751843,
"kl": 0.95703125,
"learning_rate": 1.4494882605915714e-07,
"loss": 0.0033,
"num_tokens": 7967870.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 262.4571533203125,
"epoch": 0.7725,
"grad_norm": 3.881559830490846,
"kl": 1.06953125,
"learning_rate": 1.419724838845569e-07,
"loss": 0.0025,
"num_tokens": 7994976.0,
"reward": 2.7243717670440675,
"reward_std": 0.2547113478183746,
"rewards/classifier_reward": 0.9529429793357849,
"rewards/length_reward": 0.7714285761117935,
"rewards/slop_reward": 1.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 256.0857208251953,
"epoch": 0.775,
"grad_norm": 0.042854049785843215,
"kl": 0.934375,
"learning_rate": 1.3902195302273778e-07,
"loss": 0.0033,
"num_tokens": 8021851.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 270.20001525878905,
"epoch": 0.7775,
"grad_norm": 5.120776391757703,
"kl": 0.9703125,
"learning_rate": 1.3609744618661013e-07,
"loss": 0.0019,
"num_tokens": 8049101.0,
"reward": 2.8000001430511476,
"reward_std": 0.28008740544319155,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8000000059604645,
"rewards/slop_reward": 1.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 266.88572998046874,
"epoch": 0.78,
"grad_norm": 5.825629703539792,
"kl": 1.01640625,
"learning_rate": 1.331991742129318e-07,
"loss": 0.0024,
"num_tokens": 8076201.0,
"reward": 2.828571605682373,
"reward_std": 0.1731828987598419,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8285714328289032,
"rewards/slop_reward": 1.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 255.80001220703124,
"epoch": 0.7825,
"grad_norm": 2.2750579624967204,
"kl": 0.90625,
"learning_rate": 1.3032734604710783e-07,
"loss": 0.0028,
"num_tokens": 8102845.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 258.34287109375,
"epoch": 0.785,
"grad_norm": 2.9657377944921426,
"kl": 1.23046875,
"learning_rate": 1.2748216872812745e-07,
"loss": 0.0031,
"num_tokens": 8129806.0,
"reward": 2.828571653366089,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8285714298486709,
"rewards/slop_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 241.60000915527343,
"epoch": 0.7875,
"grad_norm": 0.04358713190283898,
"kl": 0.9265625,
"learning_rate": 1.2466384737363779e-07,
"loss": 0.0033,
"num_tokens": 8156161.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 252.94286499023437,
"epoch": 0.79,
"grad_norm": 0.05992737661065536,
"kl": 0.98046875,
"learning_rate": 1.2187258516515642e-07,
"loss": 0.0034,
"num_tokens": 8182699.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 247.91429443359374,
"epoch": 0.7925,
"grad_norm": 0.07122711325744925,
"kl": 0.9984375,
"learning_rate": 1.1910858333342277e-07,
"loss": 0.0034,
"num_tokens": 8209296.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 250.11429748535156,
"epoch": 0.795,
"grad_norm": 3.4982043756301584,
"kl": 0.98671875,
"learning_rate": 1.1637204114389177e-07,
"loss": 0.0029,
"num_tokens": 8235818.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 251.08572387695312,
"epoch": 0.7975,
"grad_norm": 0.04511891765892185,
"kl": 0.93671875,
"learning_rate": 1.1366315588236741e-07,
"loss": 0.0033,
"num_tokens": 8262480.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 252.4571563720703,
"epoch": 0.8,
"grad_norm": 4.101920254520037,
"kl": 0.98984375,
"learning_rate": 1.1098212284078035e-07,
"loss": 0.0024,
"num_tokens": 8289236.0,
"reward": 2.8857144832611086,
"reward_std": 0.19518001079559327,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.8857142925262451,
"rewards/slop_reward": 1.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 248.91429748535157,
"epoch": 0.8025,
"grad_norm": 0.05706184089424821,
"kl": 0.975,
"learning_rate": 1.0832913530310783e-07,
"loss": 0.0034,
"num_tokens": 8315716.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 250.31430358886718,
"epoch": 0.805,
"grad_norm": 0.07917877336969092,
"kl": 1.02265625,
"learning_rate": 1.0570438453144043e-07,
"loss": 0.0034,
"num_tokens": 8342093.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 246.60001220703126,
"epoch": 0.8075,
"grad_norm": 0.09776954891266137,
"kl": 1.1421875,
"learning_rate": 1.0310805975219255e-07,
"loss": 0.0035,
"num_tokens": 8368479.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 241.85715026855468,
"epoch": 0.81,
"grad_norm": 0.05012984603712916,
"kl": 0.96015625,
"learning_rate": 1.0054034814246093e-07,
"loss": 0.0033,
"num_tokens": 8394862.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 244.7714385986328,
"epoch": 0.8125,
"grad_norm": 0.2165819917911517,
"kl": 1.35078125,
"learning_rate": 9.800143481652979e-08,
"loss": 0.0037,
"num_tokens": 8421276.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 246.08572998046876,
"epoch": 0.815,
"grad_norm": 0.20309822089295643,
"kl": 1.27265625,
"learning_rate": 9.549150281252632e-08,
"loss": 0.0037,
"num_tokens": 8447767.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 243.6571533203125,
"epoch": 0.8175,
"grad_norm": 0.15578036977279336,
"kl": 1.1515625,
"learning_rate": 9.30107330792243e-08,
"loss": 0.0035,
"num_tokens": 8474150.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 237.4857208251953,
"epoch": 0.82,
"grad_norm": 0.043416576178703016,
"kl": 0.9265625,
"learning_rate": 9.055930446299914e-08,
"loss": 0.0033,
"num_tokens": 8500171.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 234.57143859863282,
"epoch": 0.8225,
"grad_norm": 0.05061080629519643,
"kl": 1.034375,
"learning_rate": 8.813739369493395e-08,
"loss": 0.0034,
"num_tokens": 8526178.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 243.28572692871094,
"epoch": 0.825,
"grad_norm": 0.04111158267935015,
"kl": 0.94296875,
"learning_rate": 8.574517537807896e-08,
"loss": 0.0033,
"num_tokens": 8552519.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 242.00000915527343,
"epoch": 0.8275,
"grad_norm": 0.3393008711951354,
"kl": 1.38515625,
"learning_rate": 8.338282197486362e-08,
"loss": 0.0038,
"num_tokens": 8578538.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 249.71430053710938,
"epoch": 0.83,
"grad_norm": 5.920098814946804,
"kl": 1.471875,
"learning_rate": 8.105050379466332e-08,
"loss": 0.0034,
"num_tokens": 8604935.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 249.34286499023438,
"epoch": 0.8325,
"grad_norm": 2.5733896209901923,
"kl": 1.1671875,
"learning_rate": 7.87483889815207e-08,
"loss": 0.0031,
"num_tokens": 8631504.0,
"reward": 2.991787624359131,
"reward_std": 0.02172858864068985,
"rewards/classifier_reward": 0.9917873620986939,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 237.14287109375,
"epoch": 0.835,
"grad_norm": 2.715010668750814,
"kl": 0.91875,
"learning_rate": 7.64766435020246e-08,
"loss": 0.0028,
"num_tokens": 8657724.0,
"reward": 2.989917850494385,
"reward_std": 0.026675373315811157,
"rewards/classifier_reward": 0.9899176597595215,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 247.05715637207032,
"epoch": 0.8375,
"grad_norm": 2.9706039903092303,
"kl": 0.96171875,
"learning_rate": 7.423543113334435e-08,
"loss": 0.0029,
"num_tokens": 8684291.0,
"reward": 2.998081636428833,
"reward_std": 0.005076154321432114,
"rewards/classifier_reward": 0.9980813980102539,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 251.02857971191406,
"epoch": 0.84,
"grad_norm": 3.791827437250973,
"kl": 0.978125,
"learning_rate": 7.202491345142286e-08,
"loss": 0.0029,
"num_tokens": 8710997.0,
"reward": 2.6675583362579345,
"reward_std": 0.1484653353691101,
"rewards/classifier_reward": 0.8389866888523102,
"rewards/length_reward": 0.8285714298486709,
"rewards/slop_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 241.48572692871093,
"epoch": 0.8425,
"grad_norm": 0.26642623759823414,
"kl": 1.29609375,
"learning_rate": 6.984524981932755e-08,
"loss": 0.0037,
"num_tokens": 8736817.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 247.2571533203125,
"epoch": 0.845,
"grad_norm": 0.05826256029366563,
"kl": 0.9796875,
"learning_rate": 6.769659737576227e-08,
"loss": 0.0034,
"num_tokens": 8763338.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 247.00001525878906,
"epoch": 0.8475,
"grad_norm": 0.0820068561247779,
"kl": 0.91484375,
"learning_rate": 6.557911102373809e-08,
"loss": 0.0033,
"num_tokens": 8789832.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 241.57143859863282,
"epoch": 0.85,
"grad_norm": 0.145851308869347,
"kl": 1.04375,
"learning_rate": 6.349294341940592e-08,
"loss": 0.0034,
"num_tokens": 8816056.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 249.6571533203125,
"epoch": 0.8525,
"grad_norm": 4.893160591979278,
"kl": 1.1859375,
"learning_rate": 6.143824496105121e-08,
"loss": 0.0031,
"num_tokens": 8842714.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 245.80001220703124,
"epoch": 0.855,
"grad_norm": 0.06508578960217941,
"kl": 1.04375,
"learning_rate": 5.941516377825101e-08,
"loss": 0.0034,
"num_tokens": 8869237.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 249.80000915527344,
"epoch": 0.8575,
"grad_norm": 4.184670329516298,
"kl": 0.9828125,
"learning_rate": 5.7423845721195184e-08,
"loss": 0.0024,
"num_tokens": 8895106.0,
"reward": 2.950000190734863,
"reward_std": 0.13228756189346313,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 0.9785714268684387,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 247.22858276367188,
"epoch": 0.86,
"grad_norm": 0.04621548010483809,
"kl": 0.96796875,
"learning_rate": 5.546443435017145e-08,
"loss": 0.0034,
"num_tokens": 8921496.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 243.71429443359375,
"epoch": 0.8625,
"grad_norm": 3.0589783495407725,
"kl": 0.93515625,
"learning_rate": 5.353707092521581e-08,
"loss": 0.0028,
"num_tokens": 8947905.0,
"reward": 2.994158411026001,
"reward_std": 0.01545594185590744,
"rewards/classifier_reward": 0.9941581964492798,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 244.94286499023437,
"epoch": 0.865,
"grad_norm": 5.554678417760662,
"kl": 3.01484375,
"learning_rate": 5.16418943959282e-08,
"loss": 0.0054,
"num_tokens": 8974370.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 244.7714416503906,
"epoch": 0.8675,
"grad_norm": 0.0645376640043755,
"kl": 0.915625,
"learning_rate": 4.9779041391455775e-08,
"loss": 0.0033,
"num_tokens": 9000853.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 245.8571563720703,
"epoch": 0.87,
"grad_norm": 0.1578429426456495,
"kl": 1.0453125,
"learning_rate": 4.794864621064265e-08,
"loss": 0.0034,
"num_tokens": 9027326.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 243.51429748535156,
"epoch": 0.8725,
"grad_norm": 0.09635841303768418,
"kl": 1.05234375,
"learning_rate": 4.615084081234799e-08,
"loss": 0.0034,
"num_tokens": 9053438.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 238.80000915527344,
"epoch": 0.875,
"grad_norm": 9.025341176830713,
"kl": 0.93828125,
"learning_rate": 4.4385754805932095e-08,
"loss": 0.0024,
"num_tokens": 9079245.0,
"reward": 2.9694100856781005,
"reward_std": 0.0809337928891182,
"rewards/classifier_reward": 0.9908384680747986,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 249.28572692871094,
"epoch": 0.8775,
"grad_norm": 3.7703349335107053,
"kl": 1.09296875,
"learning_rate": 4.2653515441913646e-08,
"loss": 0.003,
"num_tokens": 9105890.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 229.60000915527343,
"epoch": 0.88,
"grad_norm": 4.44121260340152,
"kl": 0.9359375,
"learning_rate": 4.095424760279453e-08,
"loss": 0.0024,
"num_tokens": 9131259.0,
"reward": 2.8642487049102785,
"reward_std": 0.13740314245224,
"rewards/classifier_reward": 0.8642484605312347,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 238.171435546875,
"epoch": 0.8825,
"grad_norm": 4.315109786774216,
"kl": 1.1671875,
"learning_rate": 3.928807379405763e-08,
"loss": 0.0026,
"num_tokens": 9157515.0,
"reward": 2.804906415939331,
"reward_std": 0.016342369094491004,
"rewards/classifier_reward": 0.8049062207341194,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 239.2571563720703,
"epoch": 0.885,
"grad_norm": 2.5953487008425773,
"kl": 0.98125,
"learning_rate": 3.7655114135334284e-08,
"loss": 0.0029,
"num_tokens": 9183753.0,
"reward": 2.9993186473846434,
"reward_std": 0.0018032947555184364,
"rewards/classifier_reward": 0.9993184208869934,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 243.91429748535157,
"epoch": 0.8875,
"grad_norm": 3.061039239899815,
"kl": 0.87109375,
"learning_rate": 3.6055486351745324e-08,
"loss": 0.0028,
"num_tokens": 9210003.0,
"reward": 2.9983937740325928,
"reward_std": 0.004250280186533928,
"rewards/classifier_reward": 0.9983935475349426,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 248.65715637207032,
"epoch": 0.89,
"grad_norm": 0.07280216811639449,
"kl": 1.0234375,
"learning_rate": 3.448930576541309e-08,
"loss": 0.0034,
"num_tokens": 9236471.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 244.571435546875,
"epoch": 0.8925,
"grad_norm": 0.04031184293990925,
"kl": 0.85625,
"learning_rate": 3.295668528714801e-08,
"loss": 0.0032,
"num_tokens": 9262896.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 247.0571533203125,
"epoch": 0.895,
"grad_norm": 0.048980473886171605,
"kl": 0.90390625,
"learning_rate": 3.145773540830815e-08,
"loss": 0.0033,
"num_tokens": 9289145.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 247.48572692871093,
"epoch": 0.8975,
"grad_norm": 0.06968257994497579,
"kl": 0.9921875,
"learning_rate": 2.9992564192834246e-08,
"loss": 0.0034,
"num_tokens": 9315727.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 245.34286804199218,
"epoch": 0.9,
"grad_norm": 0.05048579158042856,
"kl": 0.965625,
"learning_rate": 2.8561277269457895e-08,
"loss": 0.0034,
"num_tokens": 9342091.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 240.94287109375,
"epoch": 0.9025,
"grad_norm": 0.07338683112545501,
"kl": 0.9578125,
"learning_rate": 2.7163977824087692e-08,
"loss": 0.0033,
"num_tokens": 9368444.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 245.48572387695313,
"epoch": 0.905,
"grad_norm": 2.0759195339474985,
"kl": 2.634375,
"learning_rate": 2.5800766592369073e-08,
"loss": 0.005,
"num_tokens": 9394956.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 243.1428649902344,
"epoch": 0.9075,
"grad_norm": 0.071979853158223,
"kl": 0.9921875,
"learning_rate": 2.4471741852423233e-08,
"loss": 0.0034,
"num_tokens": 9421111.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 247.60001525878906,
"epoch": 0.91,
"grad_norm": 4.112340756036376,
"kl": 1.06640625,
"learning_rate": 2.3176999417760633e-08,
"loss": 0.003,
"num_tokens": 9447147.0,
"reward": 2.997753620147705,
"reward_std": 0.005943871289491654,
"rewards/classifier_reward": 0.9977534294128418,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 247.65715637207032,
"epoch": 0.9125,
"grad_norm": 0.1514075169088936,
"kl": 1.1046875,
"learning_rate": 2.1916632630374577e-08,
"loss": 0.0035,
"num_tokens": 9473085.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 240.74286804199218,
"epoch": 0.915,
"grad_norm": 0.18737733575482768,
"kl": 1.07578125,
"learning_rate": 2.0690732354011088e-08,
"loss": 0.0035,
"num_tokens": 9499431.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 235.6571533203125,
"epoch": 0.9175,
"grad_norm": 0.04304702113082608,
"kl": 0.92421875,
"learning_rate": 1.9499386967619104e-08,
"loss": 0.0033,
"num_tokens": 9525433.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 234.6571502685547,
"epoch": 0.92,
"grad_norm": 2.545151028403328,
"kl": 0.9734375,
"learning_rate": 1.8342682358978068e-08,
"loss": 0.0029,
"num_tokens": 9551354.0,
"reward": 2.9982373237609865,
"reward_std": 0.004664153978228569,
"rewards/classifier_reward": 0.9982371211051941,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 244.11429443359376,
"epoch": 0.9225,
"grad_norm": 0.04345300022978855,
"kl": 0.9125,
"learning_rate": 1.7220701918506662e-08,
"loss": 0.0033,
"num_tokens": 9577818.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 247.74287109375,
"epoch": 0.925,
"grad_norm": 3.5704547668159674,
"kl": 1.284375,
"learning_rate": 1.6133526533250563e-08,
"loss": 0.0032,
"num_tokens": 9604325.0,
"reward": 2.9714287757873534,
"reward_std": 0.07559289336204529,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 0.9714285731315613,
"rewards/slop_reward": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 237.11429748535156,
"epoch": 0.9275,
"grad_norm": 0.041173535714356384,
"kl": 0.88828125,
"learning_rate": 1.5081234581051482e-08,
"loss": 0.0033,
"num_tokens": 9630168.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 244.8571533203125,
"epoch": 0.93,
"grad_norm": 0.08243253183651265,
"kl": 0.934375,
"learning_rate": 1.4063901924895982e-08,
"loss": 0.0033,
"num_tokens": 9656156.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 237.1428649902344,
"epoch": 0.9325,
"grad_norm": 0.05082406017565479,
"kl": 0.94921875,
"learning_rate": 1.3081601907447004e-08,
"loss": 0.0033,
"num_tokens": 9682368.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 242.80001220703124,
"epoch": 0.935,
"grad_norm": 2.8407093679191733,
"kl": 1.04453125,
"learning_rate": 1.2134405345755772e-08,
"loss": 0.003,
"num_tokens": 9708489.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 248.08572692871093,
"epoch": 0.9375,
"grad_norm": 0.14511834175077776,
"kl": 1.07421875,
"learning_rate": 1.1222380526156927e-08,
"loss": 0.0035,
"num_tokens": 9734849.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 241.48572692871093,
"epoch": 0.94,
"grad_norm": 0.042968063146662946,
"kl": 0.91875,
"learning_rate": 1.034559319934497e-08,
"loss": 0.0033,
"num_tokens": 9761221.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 235.17144165039062,
"epoch": 0.9425,
"grad_norm": 3.168647847961566,
"kl": 1.0890625,
"learning_rate": 9.504106575634663e-09,
"loss": 0.003,
"num_tokens": 9787372.0,
"reward": 2.965010404586792,
"reward_std": 0.06012881994247436,
"rewards/classifier_reward": 0.9864387512207031,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 237.91429748535157,
"epoch": 0.945,
"grad_norm": 0.04526250685823323,
"kl": 0.99375,
"learning_rate": 8.697981320403336e-09,
"loss": 0.0034,
"num_tokens": 9813619.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 246.20001220703125,
"epoch": 0.9475,
"grad_norm": 4.273461002843142,
"kl": 0.9484375,
"learning_rate": 7.927275549718226e-09,
"loss": 0.0019,
"num_tokens": 9839751.0,
"reward": 2.9877804279327393,
"reward_std": 0.03233038559556008,
"rewards/classifier_reward": 0.9877802491188049,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 247.8571533203125,
"epoch": 0.95,
"grad_norm": 0.20002216578085852,
"kl": 1.165625,
"learning_rate": 7.1920448261457715e-09,
"loss": 0.0035,
"num_tokens": 9866244.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 244.60000915527343,
"epoch": 0.9525,
"grad_norm": 0.0834845425154776,
"kl": 1.04375,
"learning_rate": 6.492342154746588e-09,
"loss": 0.0034,
"num_tokens": 9892505.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 237.31429748535157,
"epoch": 0.955,
"grad_norm": 2.52753839477545,
"kl": 1.11171875,
"learning_rate": 5.828217979253869e-09,
"loss": 0.003,
"num_tokens": 9918307.0,
"reward": 2.9943798542022706,
"reward_std": 0.014869998395442962,
"rewards/classifier_reward": 0.9943796753883362,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 235.571435546875,
"epoch": 0.9575,
"grad_norm": 3.128240693093393,
"kl": 0.98359375,
"learning_rate": 5.1997201784368395e-09,
"loss": 0.0029,
"num_tokens": 9944374.0,
"reward": 2.995487594604492,
"reward_std": 0.011939284205436707,
"rewards/classifier_reward": 0.995487380027771,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 240.9428680419922,
"epoch": 0.96,
"grad_norm": 3.5538024950839295,
"kl": 1.0,
"learning_rate": 4.606894062648969e-09,
"loss": 0.0024,
"num_tokens": 9970727.0,
"reward": 2.977288818359375,
"reward_std": 0.06008877456188202,
"rewards/classifier_reward": 0.9987171411514282,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 240.6571502685547,
"epoch": 0.9625,
"grad_norm": 0.1991534915494115,
"kl": 1.184375,
"learning_rate": 4.049782370561583e-09,
"loss": 0.0036,
"num_tokens": 9997070.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 235.34286804199218,
"epoch": 0.965,
"grad_norm": 0.05753950305238813,
"kl": 0.99375,
"learning_rate": 3.5284252660823244e-09,
"loss": 0.0034,
"num_tokens": 10023227.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 237.6571533203125,
"epoch": 0.9675,
"grad_norm": 0.09059748000607612,
"kl": 1.003125,
"learning_rate": 3.0428603354600844e-09,
"loss": 0.0034,
"num_tokens": 10049465.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 254.54286499023436,
"epoch": 0.97,
"grad_norm": 0.12475756025548783,
"kl": 1.07890625,
"learning_rate": 2.5931225845748917e-09,
"loss": 0.0035,
"num_tokens": 10076294.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 242.6571533203125,
"epoch": 0.9725,
"grad_norm": 0.4783333308376495,
"kl": 1.3875,
"learning_rate": 2.1792444364144847e-09,
"loss": 0.0038,
"num_tokens": 10102419.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 241.34286804199218,
"epoch": 0.975,
"grad_norm": 0.08716408529439204,
"kl": 0.93125,
"learning_rate": 1.8012557287367391e-09,
"loss": 0.0033,
"num_tokens": 10128786.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 241.00000915527343,
"epoch": 0.9775,
"grad_norm": 3.1645439252715826,
"kl": 0.9140625,
"learning_rate": 1.4591837119186102e-09,
"loss": 0.0028,
"num_tokens": 10155141.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 235.28572692871094,
"epoch": 0.98,
"grad_norm": 0.41518239333595675,
"kl": 1.1234375,
"learning_rate": 1.1530530469914256e-09,
"loss": 0.0035,
"num_tokens": 10181268.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 243.7714385986328,
"epoch": 0.9825,
"grad_norm": 0.040861855208901524,
"kl": 0.878125,
"learning_rate": 8.828858038632536e-10,
"loss": 0.0033,
"num_tokens": 10207349.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 249.9714385986328,
"epoch": 0.985,
"grad_norm": 0.12170276230850544,
"kl": 1.1421875,
"learning_rate": 6.48701459727563e-10,
"loss": 0.0035,
"num_tokens": 10233544.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 243.00001220703126,
"epoch": 0.9875,
"grad_norm": 0.04242721259201106,
"kl": 0.93828125,
"learning_rate": 4.5051689765929213e-10,
"loss": 0.0033,
"num_tokens": 10259969.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 248.68572387695312,
"epoch": 0.99,
"grad_norm": 2.430119389458536,
"kl": 0.96875,
"learning_rate": 2.883464053973772e-10,
"loss": 0.0029,
"num_tokens": 10286476.0,
"reward": 2.9996359825134276,
"reward_std": 0.0009637950919568538,
"rewards/classifier_reward": 0.9996357202529907,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 244.34286499023438,
"epoch": 0.9925,
"grad_norm": 2.4946704769049948,
"kl": 1.090625,
"learning_rate": 1.6220167431502118e-10,
"loss": 0.003,
"num_tokens": 10312797.0,
"reward": 2.998571014404297,
"reward_std": 0.0037812769412994387,
"rewards/classifier_reward": 0.9985708117485046,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 240.60001220703126,
"epoch": 0.995,
"grad_norm": 0.057929252615434516,
"kl": 0.93359375,
"learning_rate": 7.209179857675663e-11,
"loss": 0.0033,
"num_tokens": 10338970.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 238.6285827636719,
"epoch": 0.9975,
"grad_norm": 0.06529828900649211,
"kl": 1.0125,
"learning_rate": 1.8023274482636965e-11,
"loss": 0.0034,
"num_tokens": 10365242.0,
"reward": 3.000000238418579,
"reward_std": 0.0,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 1.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 239.8800018310547,
"epoch": 1.0,
"grad_norm": 2.484762868251253,
"kl": 0.875,
"learning_rate": 0.0,
"loss": 0.0028,
"num_tokens": 10384026.0,
"reward": 2.978571653366089,
"reward_std": 0.056694668531417844,
"rewards/classifier_reward": 1.0,
"rewards/length_reward": 1.0,
"rewards/slop_reward": 0.9785714268684387,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}