{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1000.0000610351562, "epoch": 0.0025, "grad_norm": 0.2079561913831216, "kl": 0.0158782958984375, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "num_tokens": 52920.0, "reward": 2.0, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.0, "rewards/slop_reward": 1.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 179.22857971191405, "epoch": 0.005, "grad_norm": 12.249017739002824, "kl": 0.00041656494140625, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "num_tokens": 77113.0, "reward": 1.8515485525131226, "reward_std": 0.6024735510349274, "rewards/classifier_reward": 0.37967347651720046, "rewards/length_reward": 0.7142857313156128, "rewards/slop_reward": 0.7575892865657806, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 133.4857208251953, "epoch": 0.0075, "grad_norm": 10089.085994752151, "kl": 0.001357269287109375, "learning_rate": 1e-07, "loss": 0.0, "num_tokens": 99305.0, "reward": 1.5958443641662599, "reward_std": 0.44231254458427427, "rewards/classifier_reward": 0.4083442732691765, "rewards/length_reward": 0.3428571462631226, "rewards/slop_reward": 0.8446428537368774, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 215.08572387695312, "epoch": 0.01, "grad_norm": 2054.9417817778553, "kl": 0.0014644622802734374, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "num_tokens": 124753.0, "reward": 1.2784200072288514, "reward_std": 0.6290957629680634, "rewards/classifier_reward": 0.1494020951911807, "rewards/length_reward": 0.4571428656578064, "rewards/slop_reward": 0.6718749940395355, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 173.74286346435548, "epoch": 0.0125, "grad_norm": 2266.5233619504957, "kl": 0.001641082763671875, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 148708.0, "reward": 1.3791944861412049, "reward_std": 0.43466432094573976, "rewards/classifier_reward": 0.17383726984262465, "rewards/length_reward": 0.5428571462631225, "rewards/slop_reward": 0.6625, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 202.02857971191406, "epoch": 0.015, "grad_norm": 11.643315762932316, "kl": 0.0005245208740234375, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 173615.0, "reward": 2.2288811445236205, "reward_std": 0.4513823240995407, "rewards/classifier_reward": 0.4342381663620472, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 0.9089285731315613, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 213.74286499023438, "epoch": 0.0175, "grad_norm": 13.650814377669256, "kl": 0.000527191162109375, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "num_tokens": 199016.0, "reward": 1.9626029968261718, "reward_std": 0.3459654450416565, "rewards/classifier_reward": 0.39831727296113967, "rewards/length_reward": 0.7142857164144516, "rewards/slop_reward": 0.8499999940395355, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 288.71429443359375, "epoch": 0.02, "grad_norm": 10.936857808142296, "kl": 0.0004474639892578125, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "num_tokens": 227041.0, "reward": 1.8008938789367677, "reward_std": 0.3342688336968422, "rewards/classifier_reward": 0.4794651668518782, "rewards/length_reward": 0.4285714328289032, "rewards/slop_reward": 0.8928571343421936, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 135.14286193847656, "epoch": 0.0225, "grad_norm": 13.869838227975858, "kl": 0.000525665283203125, "learning_rate": 3e-07, "loss": 0.0005, "num_tokens": 249691.0, "reward": 1.9354748964309691, "reward_std": 0.3361890375614166, "rewards/classifier_reward": 0.5604747980833054, "rewards/length_reward": 0.5142857193946838, "rewards/slop_reward": 0.8607142925262451, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 208.91429748535157, "epoch": 0.025, "grad_norm": 12.652344276830467, "kl": 0.0005481719970703125, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 274843.0, "reward": 1.943293523788452, "reward_std": 0.6581979870796204, "rewards/classifier_reward": 0.5843648463487625, "rewards/length_reward": 0.6000000178813935, "rewards/slop_reward": 0.7589285731315613, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 205.71429443359375, "epoch": 0.0275, "grad_norm": 15.071895621435214, "kl": 0.00064544677734375, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "num_tokens": 299820.0, "reward": 1.5763698101043702, "reward_std": 0.4757842034101486, "rewards/classifier_reward": 0.431726861000061, "rewards/length_reward": 0.25714286863803865, "rewards/slop_reward": 0.8875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 230.4285858154297, "epoch": 0.03, "grad_norm": 190.50276677565853, "kl": 0.014757537841796875, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 325730.0, "reward": 2.0366717338562013, "reward_std": 0.44329026341438293, "rewards/classifier_reward": 0.3331002712249756, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 0.8178571462631226, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 205.62857971191406, "epoch": 0.0325, "grad_norm": 12.86535673011762, "kl": 0.0007328033447265625, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "num_tokens": 350634.0, "reward": 1.3644737243652343, "reward_std": 0.5067215681076049, "rewards/classifier_reward": 0.29661653861403464, "rewards/length_reward": 0.31428571939468386, "rewards/slop_reward": 0.7535714268684387, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 149.25715026855468, "epoch": 0.035, "grad_norm": 13.332730561008187, "kl": 0.00087738037109375, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "num_tokens": 373261.0, "reward": 1.4552783489227294, "reward_std": 0.6117016971111298, "rewards/classifier_reward": 0.18384971991181373, "rewards/length_reward": 0.40000001192092893, "rewards/slop_reward": 0.8714285612106323, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 213.71429901123048, "epoch": 0.0375, "grad_norm": 12.15928533039066, "kl": 0.001068878173828125, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 398319.0, "reward": 1.7259351372718812, "reward_std": 0.5805591940879822, "rewards/classifier_reward": 0.5187922030687332, "rewards/length_reward": 0.48571428954601287, "rewards/slop_reward": 0.7214285731315613, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 200.94286499023437, "epoch": 0.04, "grad_norm": 210.10906643069993, "kl": 0.006036376953125, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "num_tokens": 423267.0, "reward": 1.5157490253448487, "reward_std": 0.4267912685871124, "rewards/classifier_reward": 0.32646322101354597, "rewards/length_reward": 0.37142857611179353, "rewards/slop_reward": 0.8178571403026581, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 226.17143859863282, "epoch": 0.0425, "grad_norm": 10.900066010271587, "kl": 0.0018096923828125, "learning_rate": 5.666666666666666e-07, "loss": 0.0, "num_tokens": 449061.0, "reward": 1.5792103052139281, "reward_std": 0.5069970846176147, "rewards/classifier_reward": 0.13992448002099991, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.7535714209079742, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 167.08572387695312, "epoch": 0.045, "grad_norm": 265.89705750792126, "kl": 0.0086395263671875, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 472825.0, "reward": 1.8174922943115235, "reward_std": 0.39020195603370667, "rewards/classifier_reward": 0.3174922451376915, "rewards/length_reward": 0.6285714387893677, "rewards/slop_reward": 0.8714285731315613, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 150.85714721679688, "epoch": 0.0475, "grad_norm": 16.75575953327707, "kl": 0.0071929931640625, "learning_rate": 6.333333333333332e-07, "loss": 0.0, "num_tokens": 496025.0, "reward": 1.9033495664596558, "reward_std": 0.46038708090782166, "rewards/classifier_reward": 0.5676352053880691, "rewards/length_reward": 0.4000000059604645, "rewards/slop_reward": 0.9357142806053161, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 170.20000610351562, "epoch": 0.05, "grad_norm": 12.507998039364391, "kl": 0.008447265625, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 519775.0, "reward": 1.8573363780975343, "reward_std": 0.46103876233100893, "rewards/classifier_reward": 0.3716219961643219, "rewards/length_reward": 0.6571428596973419, "rewards/slop_reward": 0.8285714149475097, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 254.57144317626953, "epoch": 0.0525, "grad_norm": 10.08679593336001, "kl": 0.005413818359375, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 546566.0, "reward": 1.2423678815364838, "reward_std": 0.4237713754177094, "rewards/classifier_reward": 0.2472785457968712, "rewards/length_reward": 0.45714286267757415, "rewards/slop_reward": 0.5379464238882065, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 231.60000915527343, "epoch": 0.055, "grad_norm": 120.69825805503362, "kl": 0.01463623046875, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "num_tokens": 572155.0, "reward": 1.6102877855300903, "reward_std": 0.47996904850006106, "rewards/classifier_reward": 0.39421619176864625, "rewards/length_reward": 0.45714286267757415, "rewards/slop_reward": 0.7589285731315613, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 244.971435546875, "epoch": 0.0575, "grad_norm": 10.506951478641199, "kl": 0.0114501953125, "learning_rate": 7.666666666666667e-07, "loss": 0.0, "num_tokens": 598457.0, "reward": 1.7265684366226197, "reward_std": 0.5010857343673706, "rewards/classifier_reward": 0.35513979494571685, "rewards/length_reward": 0.542857152223587, "rewards/slop_reward": 0.8285714268684388, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 212.17143859863282, "epoch": 0.06, "grad_norm": 14.496994349875411, "kl": 0.019873046875, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 623795.0, "reward": 1.9756683349609374, "reward_std": 0.6098839461803436, "rewards/classifier_reward": 0.3970968216657639, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.8928571343421936, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 149.71429138183595, "epoch": 0.0625, "grad_norm": 13.955034723905843, "kl": 0.023046875, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "num_tokens": 646708.0, "reward": 1.7968461036682128, "reward_std": 0.45076006054878237, "rewards/classifier_reward": 0.3789888650178909, "rewards/length_reward": 0.5142857193946838, "rewards/slop_reward": 0.9035714268684387, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 172.02857818603516, "epoch": 0.065, "grad_norm": 13.086801218372846, "kl": 0.03857421875, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "num_tokens": 670238.0, "reward": 2.018597435951233, "reward_std": 0.5030077040195465, "rewards/classifier_reward": 0.44716874957084657, "rewards/length_reward": 0.6571428656578064, "rewards/slop_reward": 0.9142857074737549, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 139.0571487426758, "epoch": 0.0675, "grad_norm": 13.874334958965811, "kl": 0.044140625, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 693025.0, "reward": 1.3813447833061219, "reward_std": 0.4743997871875763, "rewards/classifier_reward": 0.3313447292894125, "rewards/length_reward": 0.28571428954601286, "rewards/slop_reward": 0.7642857193946838, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 312.0857269287109, "epoch": 0.07, "grad_norm": 9.049982563385965, "kl": 0.0622314453125, "learning_rate": 9.333333333333333e-07, "loss": 0.0001, "num_tokens": 721725.0, "reward": 1.688427746295929, "reward_std": 0.4802214980125427, "rewards/classifier_reward": 0.48485626801848414, "rewards/length_reward": 0.5142857193946838, "rewards/slop_reward": 0.6892857074737548, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 170.34286499023438, "epoch": 0.0725, "grad_norm": 13.386821366059916, "kl": 0.100341796875, "learning_rate": 9.666666666666666e-07, "loss": 0.0001, "num_tokens": 745420.0, "reward": 1.488853096961975, "reward_std": 0.4731449127197266, "rewards/classifier_reward": 0.2870673179626465, "rewards/length_reward": 0.31428571939468386, "rewards/slop_reward": 0.8875, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 250.0571502685547, "epoch": 0.075, "grad_norm": 11.769241786475325, "kl": 0.08681640625, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 771967.0, "reward": 1.6088370084762573, "reward_std": 0.5624471366405487, "rewards/classifier_reward": 0.2784797720611095, "rewards/length_reward": 0.4857142925262451, "rewards/slop_reward": 0.8446428537368774, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 203.6285858154297, "epoch": 0.0775, "grad_norm": 15.786130440761754, "kl": 0.120703125, "learning_rate": 9.999819767255174e-07, "loss": 0.0001, "num_tokens": 797014.0, "reward": 1.9446904182434082, "reward_std": 0.36648078858852384, "rewards/classifier_reward": 0.4661188304424286, "rewards/length_reward": 0.5428571462631225, "rewards/slop_reward": 0.9357142806053161, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 201.771435546875, "epoch": 0.08, "grad_norm": 15.903852139666391, "kl": 0.3494140625, "learning_rate": 9.999279082014231e-07, "loss": 0.0003, "num_tokens": 821995.0, "reward": 1.9787015676498414, "reward_std": 0.585156524181366, "rewards/classifier_reward": 0.5215587019920349, "rewards/length_reward": 0.5428571552038193, "rewards/slop_reward": 0.9142857074737549, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 224.42858276367187, "epoch": 0.0825, "grad_norm": 29.64188533235313, "kl": 0.500390625, "learning_rate": 9.998377983256848e-07, "loss": 0.0005, "num_tokens": 847628.0, "reward": 2.037756896018982, "reward_std": 0.5854993224143982, "rewards/classifier_reward": 0.45918539762496946, "rewards/length_reward": 0.6857142984867096, "rewards/slop_reward": 0.8928571343421936, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 213.91429443359374, "epoch": 0.085, "grad_norm": 13.883875104448363, "kl": 1.748828125, "learning_rate": 9.997116535946027e-07, "loss": 0.0018, "num_tokens": 872871.0, "reward": 1.7733258247375487, "reward_std": 0.5001925647258758, "rewards/classifier_reward": 0.4197543442249298, "rewards/length_reward": 0.6857142955064773, "rewards/slop_reward": 0.6678571343421936, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 229.771435546875, "epoch": 0.0875, "grad_norm": 14.166455798732835, "kl": 1.48515625, "learning_rate": 9.995494831023408e-07, "loss": 0.0015, "num_tokens": 898833.0, "reward": 2.2283714771270753, "reward_std": 0.4383322179317474, "rewards/classifier_reward": 0.6283714175224304, "rewards/length_reward": 0.6857142865657806, "rewards/slop_reward": 0.9142857074737549, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 240.5428680419922, "epoch": 0.09, "grad_norm": 20.07276091542321, "kl": 1.05703125, "learning_rate": 9.993512985402724e-07, "loss": 0.0011, "num_tokens": 924948.0, "reward": 1.8037936687469482, "reward_std": 0.4002750262618065, "rewards/classifier_reward": 0.4395078897476196, "rewards/length_reward": 0.4285714328289032, "rewards/slop_reward": 0.9357142806053161, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 249.42858123779297, "epoch": 0.0925, "grad_norm": 16.81239416666408, "kl": 7.96875, "learning_rate": 9.991171141961368e-07, "loss": 0.0085, "num_tokens": 951092.0, "reward": 1.9397377490997314, "reward_std": 0.4508820950984955, "rewards/classifier_reward": 0.6325947523117066, "rewards/length_reward": 0.4571428656578064, "rewards/slop_reward": 0.8499999880790711, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 196.17143707275392, "epoch": 0.095, "grad_norm": 19.19953442548882, "kl": 3.775, "learning_rate": 9.988469469530085e-07, "loss": 0.0038, "num_tokens": 975652.0, "reward": 2.0388022661209106, "reward_std": 0.3248827219009399, "rewards/classifier_reward": 0.32451642835512756, "rewards/length_reward": 0.7142857193946839, "rewards/slop_reward": 1.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 352.22858276367185, "epoch": 0.0975, "grad_norm": 18.25441612913441, "kl": 0.79091796875, "learning_rate": 9.985408162880813e-07, "loss": 0.0008, "num_tokens": 1005900.0, "reward": 1.6981183767318726, "reward_std": 0.433578160405159, "rewards/classifier_reward": 0.5351718068122864, "rewards/length_reward": 0.40000000298023225, "rewards/slop_reward": 0.7629464268684387, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 321.02858276367186, "epoch": 0.1, "grad_norm": 10.54620387526757, "kl": 0.754296875, "learning_rate": 9.98198744271263e-07, "loss": 0.0008, "num_tokens": 1034877.0, "reward": 1.9676434993743896, "reward_std": 0.4024129122495651, "rewards/classifier_reward": 0.6390719175338745, "rewards/length_reward": 0.4571428656578064, "rewards/slop_reward": 0.8714285612106323, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 214.17143859863282, "epoch": 0.1025, "grad_norm": 13.824754472854506, "kl": 0.911328125, "learning_rate": 9.978207555635855e-07, "loss": 0.0009, "num_tokens": 1060157.0, "reward": 2.1357697248458862, "reward_std": 0.5395443201065063, "rewards/classifier_reward": 0.6357696294784546, "rewards/length_reward": 0.5428571552038193, "rewards/slop_reward": 0.9571428537368775, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 223.6285827636719, "epoch": 0.105, "grad_norm": 18.073667394788664, "kl": 0.519921875, "learning_rate": 9.97406877415425e-07, "loss": 0.0005, "num_tokens": 1085893.0, "reward": 2.068192982673645, "reward_std": 0.4686335951089859, "rewards/classifier_reward": 0.4967642992734909, "rewards/length_reward": 0.6571428775787354, "rewards/slop_reward": 0.9142857074737549, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 297.40001220703124, "epoch": 0.1075, "grad_norm": 10.8725727731831, "kl": 0.43515625, "learning_rate": 9.9695713966454e-07, "loss": 0.0004, "num_tokens": 1114056.0, "reward": 1.6727801322937013, "reward_std": 0.501282411813736, "rewards/classifier_reward": 0.501351535320282, "rewards/length_reward": 0.3428571492433548, "rewards/slop_reward": 0.8285714149475097, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 204.85715026855468, "epoch": 0.11, "grad_norm": 11.442387542173964, "kl": 0.574609375, "learning_rate": 9.964715747339175e-07, "loss": 0.0006, "num_tokens": 1138804.0, "reward": 2.027732276916504, "reward_std": 0.6377828001976014, "rewards/classifier_reward": 0.6545179545879364, "rewards/length_reward": 0.48571430146694183, "rewards/slop_reward": 0.8875, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 290.6285888671875, "epoch": 0.1125, "grad_norm": 11.010885455058528, "kl": 0.451171875, "learning_rate": 9.959502176294382e-07, "loss": 0.0005, "num_tokens": 1166842.0, "reward": 1.8717997074127197, "reward_std": 0.45424606651067734, "rewards/classifier_reward": 0.45037112236022947, "rewards/length_reward": 0.4857142925262451, "rewards/slop_reward": 0.9357142806053161, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 251.60001220703126, "epoch": 0.115, "grad_norm": 10.74794734294439, "kl": 0.378125, "learning_rate": 9.953931059373511e-07, "loss": 0.0004, "num_tokens": 1193568.0, "reward": 2.1009025812149047, "reward_std": 0.5576321303844451, "rewards/classifier_reward": 0.6455453038215637, "rewards/length_reward": 0.6000000059604644, "rewards/slop_reward": 0.8553571462631225, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 205.51429595947266, "epoch": 0.1175, "grad_norm": 12.174860136090478, "kl": 0.580859375, "learning_rate": 9.948002798215631e-07, "loss": 0.0006, "num_tokens": 1218520.0, "reward": 1.7478339910507201, "reward_std": 0.44800390899181364, "rewards/classifier_reward": 0.5246196419000626, "rewards/length_reward": 0.4000000059604645, "rewards/slop_reward": 0.8232142806053162, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 206.371435546875, "epoch": 0.12, "grad_norm": 10.734345225393307, "kl": 0.77734375, "learning_rate": 9.94171782020746e-07, "loss": 0.0008, "num_tokens": 1243553.0, "reward": 2.33663330078125, "reward_std": 0.5126991689205169, "rewards/classifier_reward": 0.6152046620845795, "rewards/length_reward": 0.7428571522235871, "rewards/slop_reward": 0.9785714268684387, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 349.8000244140625, "epoch": 0.1225, "grad_norm": 9.516105487183257, "kl": 0.6921875, "learning_rate": 9.935076578452534e-07, "loss": 0.0007, "num_tokens": 1273677.0, "reward": 1.6252358913421632, "reward_std": 0.4947394013404846, "rewards/classifier_reward": 0.5180929381400347, "rewards/length_reward": 0.25714286267757414, "rewards/slop_reward": 0.85, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 229.57143859863282, "epoch": 0.125, "grad_norm": 11.582425399894849, "kl": 1.07734375, "learning_rate": 9.928079551738541e-07, "loss": 0.0011, "num_tokens": 1299334.0, "reward": 2.2410747528076174, "reward_std": 0.5065959393978119, "rewards/classifier_reward": 0.5834853827953339, "rewards/length_reward": 0.6857143044471741, "rewards/slop_reward": 0.971875, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 254.68572692871095, "epoch": 0.1275, "grad_norm": 9.691651370929392, "kl": 0.7765625, "learning_rate": 9.920727244502818e-07, "loss": 0.0008, "num_tokens": 1326112.0, "reward": 1.8951802968978881, "reward_std": 0.561316728591919, "rewards/classifier_reward": 0.4666088119149208, "rewards/length_reward": 0.5142857283353806, "rewards/slop_reward": 0.9142857074737549, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 245.82858581542968, "epoch": 0.13, "grad_norm": 9.231142781375425, "kl": 0.4640625, "learning_rate": 9.913020186795966e-07, "loss": 0.0005, "num_tokens": 1352635.0, "reward": 2.4359071254730225, "reward_std": 0.504255086183548, "rewards/classifier_reward": 0.8644783020019531, "rewards/length_reward": 0.6571428656578064, "rewards/slop_reward": 0.9142857074737549, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 336.2857360839844, "epoch": 0.1325, "grad_norm": 8.070502355443343, "kl": 1.01142578125, "learning_rate": 9.904958934243653e-07, "loss": 0.001, "num_tokens": 1382325.0, "reward": 1.9690003395080566, "reward_std": 0.4140917003154755, "rewards/classifier_reward": 0.595339572429657, "rewards/length_reward": 0.5142857253551483, "rewards/slop_reward": 0.859375, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 176.85714721679688, "epoch": 0.135, "grad_norm": 13.273584784804553, "kl": 0.737109375, "learning_rate": 9.89654406800655e-07, "loss": 0.0007, "num_tokens": 1406364.0, "reward": 2.015079474449158, "reward_std": 0.34893424808979034, "rewards/classifier_reward": 0.6650793373584747, "rewards/length_reward": 0.37142857611179353, "rewards/slop_reward": 0.9785714268684387, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 270.77144317626954, "epoch": 0.1375, "grad_norm": 10.259811273773323, "kl": 0.41875, "learning_rate": 9.887776194738431e-07, "loss": 0.0004, "num_tokens": 1433517.0, "reward": 2.0523552179336546, "reward_std": 0.43476098477840425, "rewards/classifier_reward": 0.72378368973732, "rewards/length_reward": 0.37142857909202576, "rewards/slop_reward": 0.9571428537368775, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 270.20001220703125, "epoch": 0.14, "grad_norm": 98.33006992571028, "kl": 25.924609375, "learning_rate": 9.878655946542442e-07, "loss": 0.0258, "num_tokens": 1460894.0, "reward": 2.2650604486465453, "reward_std": 0.5314578056335449, "rewards/classifier_reward": 0.7382745862007141, "rewards/length_reward": 0.6285714358091354, "rewards/slop_reward": 0.8982142806053162, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 213.82858581542968, "epoch": 0.1425, "grad_norm": 11.336151350522224, "kl": 0.592578125, "learning_rate": 9.86918398092553e-07, "loss": 0.0006, "num_tokens": 1486239.0, "reward": 2.248945116996765, "reward_std": 0.4141096830368042, "rewards/classifier_reward": 0.6918022215366364, "rewards/length_reward": 0.6000000059604644, "rewards/slop_reward": 0.9571428537368775, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 301.85715637207034, "epoch": 0.145, "grad_norm": 9.956188328948622, "kl": 0.44765625, "learning_rate": 9.85936098075104e-07, "loss": 0.0004, "num_tokens": 1514661.0, "reward": 1.9887679100036622, "reward_std": 0.47130251824855807, "rewards/classifier_reward": 0.6173391878604889, "rewards/length_reward": 0.45714286267757415, "rewards/slop_reward": 0.9142857074737549, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 194.42857971191407, "epoch": 0.1475, "grad_norm": 10.413710834913637, "kl": 0.428125, "learning_rate": 9.849187654189485e-07, "loss": 0.0004, "num_tokens": 1539249.0, "reward": 2.1201124668121336, "reward_std": 0.4662812829017639, "rewards/classifier_reward": 0.8486838459968566, "rewards/length_reward": 0.3142857253551483, "rewards/slop_reward": 0.9571428537368775, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 223.71429443359375, "epoch": 0.15, "grad_norm": 9.48253169035506, "kl": 0.6697265625, "learning_rate": 9.838664734667495e-07, "loss": 0.0007, "num_tokens": 1564932.0, "reward": 2.3097215414047243, "reward_std": 0.37846060991287234, "rewards/classifier_reward": 0.6097213685512543, "rewards/length_reward": 0.7428571492433548, "rewards/slop_reward": 0.9571428537368775, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 213.08572235107422, "epoch": 0.1525, "grad_norm": 17.056291850314448, "kl": 0.484375, "learning_rate": 9.827792980814933e-07, "loss": 0.0005, "num_tokens": 1590245.0, "reward": 2.139114594459534, "reward_std": 0.3371311604976654, "rewards/classifier_reward": 0.6605431139469147, "rewards/length_reward": 0.5428571492433548, "rewards/slop_reward": 0.9357142806053161, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 224.68572540283202, "epoch": 0.155, "grad_norm": 9.724576982946711, "kl": 0.580859375, "learning_rate": 9.81657317641022e-07, "loss": 0.0006, "num_tokens": 1615704.0, "reward": 2.022816562652588, "reward_std": 0.4144854575395584, "rewards/classifier_reward": 0.5585307866334915, "rewards/length_reward": 0.4857142955064774, "rewards/slop_reward": 0.9785714268684387, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 343.8285888671875, "epoch": 0.1575, "grad_norm": 7.143874630694377, "kl": 0.4015625, "learning_rate": 9.805006130323808e-07, "loss": 0.0004, "num_tokens": 1645654.0, "reward": 2.0719400882720946, "reward_std": 0.3876799166202545, "rewards/classifier_reward": 0.6647971898317337, "rewards/length_reward": 0.42857143878936765, "rewards/slop_reward": 0.9785714268684387, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 266.00001220703126, "epoch": 0.16, "grad_norm": 9.518081765887688, "kl": 0.3986328125, "learning_rate": 9.793092676459888e-07, "loss": 0.0004, "num_tokens": 1672883.0, "reward": 2.084528160095215, "reward_std": 0.473650124669075, "rewards/classifier_reward": 0.584528061747551, "rewards/length_reward": 0.5428571552038193, "rewards/slop_reward": 0.9571428537368775, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 279.71429748535155, "epoch": 0.1625, "grad_norm": 8.015308302511995, "kl": 0.54375, "learning_rate": 9.780833673696254e-07, "loss": 0.0005, "num_tokens": 1700438.0, "reward": 2.3380573272705076, "reward_std": 0.4121220216155052, "rewards/classifier_reward": 0.8237714767456055, "rewards/length_reward": 0.5142857223749161, "rewards/slop_reward": 1.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 248.31430053710938, "epoch": 0.165, "grad_norm": 8.749621821849056, "kl": 0.421484375, "learning_rate": 9.768230005822393e-07, "loss": 0.0004, "num_tokens": 1727047.0, "reward": 2.2453027963638306, "reward_std": 0.38129588067531583, "rewards/classifier_reward": 0.781016880273819, "rewards/length_reward": 0.48571428954601287, "rewards/slop_reward": 0.9785714268684387, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 283.3142974853516, "epoch": 0.1675, "grad_norm": 15.835663084851635, "kl": 0.4734375, "learning_rate": 9.755282581475767e-07, "loss": 0.0005, "num_tokens": 1754863.0, "reward": 2.350207304954529, "reward_std": 0.5121555209159852, "rewards/classifier_reward": 0.7930643558502197, "rewards/length_reward": 0.6000000149011612, "rewards/slop_reward": 0.9571428537368775, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 207.0571502685547, "epoch": 0.17, "grad_norm": 10.560189239731793, "kl": 0.418359375, "learning_rate": 9.741992334076308e-07, "loss": 0.0004, "num_tokens": 1780017.0, "reward": 2.56486029624939, "reward_std": 0.4183764517307281, "rewards/classifier_reward": 0.7505744874477387, "rewards/length_reward": 0.8571428656578064, "rewards/slop_reward": 0.9571428537368775, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 207.31429443359374, "epoch": 0.1725, "grad_norm": 12.483812322806989, "kl": 0.690234375, "learning_rate": 9.728360221759123e-07, "loss": 0.0007, "num_tokens": 1805172.0, "reward": 2.421455478668213, "reward_std": 0.5775963604450226, "rewards/classifier_reward": 0.7643125176429748, "rewards/length_reward": 0.7428571522235871, "rewards/slop_reward": 0.9142857074737549, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 262.8285858154297, "epoch": 0.175, "grad_norm": 10.184314056992156, "kl": 0.665234375, "learning_rate": 9.71438722730542e-07, "loss": 0.0007, "num_tokens": 1832291.0, "reward": 2.078964352607727, "reward_std": 0.5653822362422943, "rewards/classifier_reward": 0.6146785497665406, "rewards/length_reward": 0.4857142984867096, "rewards/slop_reward": 0.9785714268684387, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 338.9428741455078, "epoch": 0.1775, "grad_norm": 8.806960986372623, "kl": 0.46015625, "learning_rate": 9.700074358071656e-07, "loss": 0.0005, "num_tokens": 1861997.0, "reward": 2.0550333499908446, "reward_std": 0.5070074677467347, "rewards/classifier_reward": 0.7263502657413483, "rewards/length_reward": 0.40000001192092893, "rewards/slop_reward": 0.9286830306053162, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 270.0857299804687, "epoch": 0.18, "grad_norm": 9.059350132300025, "kl": 0.450390625, "learning_rate": 9.685422645916918e-07, "loss": 0.0005, "num_tokens": 1888854.0, "reward": 2.531459331512451, "reward_std": 0.3892214775085449, "rewards/classifier_reward": 0.860030734539032, "rewards/length_reward": 0.7142857193946839, "rewards/slop_reward": 0.9571428537368775, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 291.4285827636719, "epoch": 0.1825, "grad_norm": 12.813964228104469, "kl": 0.52265625, "learning_rate": 9.670433147128521e-07, "loss": 0.0005, "num_tokens": 1916974.0, "reward": 2.2648436784744264, "reward_std": 0.40854659080505373, "rewards/classifier_reward": 0.8577007412910461, "rewards/length_reward": 0.42857143878936765, "rewards/slop_reward": 0.9785714268684387, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 246.31429748535157, "epoch": 0.185, "grad_norm": 241.73474838722862, "kl": 0.453125, "learning_rate": 9.655106942345868e-07, "loss": 0.0005, "num_tokens": 1943440.0, "reward": 2.366797590255737, "reward_std": 0.41730722188949587, "rewards/classifier_reward": 0.766797399520874, "rewards/length_reward": 0.600000011920929, "rewards/slop_reward": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 250.91429443359374, "epoch": 0.1875, "grad_norm": 8.395335843546244, "kl": 0.5015625, "learning_rate": 9.639445136482546e-07, "loss": 0.0005, "num_tokens": 1969900.0, "reward": 2.4618414878845214, "reward_std": 0.540682977437973, "rewards/classifier_reward": 0.8261271595954895, "rewards/length_reward": 0.6571428716182709, "rewards/slop_reward": 0.9785714268684387, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 247.11429748535156, "epoch": 0.19, "grad_norm": 9.31969072416839, "kl": 0.53671875, "learning_rate": 9.623448858646656e-07, "loss": 0.0005, "num_tokens": 1996327.0, "reward": 2.259377145767212, "reward_std": 0.4671668648719788, "rewards/classifier_reward": 0.916519820690155, "rewards/length_reward": 0.3428571581840515, "rewards/slop_reward": 1.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 316.9714416503906, "epoch": 0.1925, "grad_norm": 7.365001527017238, "kl": 1.408984375, "learning_rate": 9.607119262059425e-07, "loss": 0.0014, "num_tokens": 2024968.0, "reward": 2.266000509262085, "reward_std": 0.5217303335666656, "rewards/classifier_reward": 0.8231433391571045, "rewards/length_reward": 0.48571430146694183, "rewards/slop_reward": 0.9571428537368775, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 228.62857971191406, "epoch": 0.195, "grad_norm": 45.20061919834423, "kl": 23.44453125, "learning_rate": 9.590457523972055e-07, "loss": 0.0236, "num_tokens": 2050735.0, "reward": 2.3131242275238035, "reward_std": 0.5057340741157532, "rewards/classifier_reward": 0.8827670216560364, "rewards/length_reward": 0.4571428656578064, "rewards/slop_reward": 0.9732142806053161, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 217.91429443359374, "epoch": 0.1975, "grad_norm": 9.435990839066255, "kl": 0.4484375, "learning_rate": 9.573464845580863e-07, "loss": 0.0004, "num_tokens": 2076160.0, "reward": 2.3861000537872314, "reward_std": 0.6137513637542724, "rewards/classifier_reward": 0.8432427525520325, "rewards/length_reward": 0.5428571581840516, "rewards/slop_reward": 1.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 250.91429443359374, "epoch": 0.2, "grad_norm": 7.98212579749696, "kl": 0.5078125, "learning_rate": 9.556142451940679e-07, "loss": 0.0005, "num_tokens": 2102862.0, "reward": 2.2402546644210815, "reward_std": 0.602302199602127, "rewards/classifier_reward": 0.7473974108695984, "rewards/length_reward": 0.600000011920929, "rewards/slop_reward": 0.8928571343421936, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 287.34287414550784, "epoch": 0.2025, "grad_norm": 8.04736588295688, "kl": 0.61015625, "learning_rate": 9.53849159187652e-07, "loss": 0.0006, "num_tokens": 2130556.0, "reward": 2.3916960954666138, "reward_std": 0.45145381689071656, "rewards/classifier_reward": 0.9631245970726013, "rewards/length_reward": 0.5142857253551483, "rewards/slop_reward": 0.9142857074737549, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 264.02857666015626, "epoch": 0.205, "grad_norm": 8.203747363662693, "kl": 0.523828125, "learning_rate": 9.520513537893573e-07, "loss": 0.0005, "num_tokens": 2157656.0, "reward": 2.4363236665725707, "reward_std": 0.32723597437143326, "rewards/classifier_reward": 0.7720378637313843, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.9785714268684387, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 229.02858276367186, "epoch": 0.2075, "grad_norm": 7.89698513820585, "kl": 0.556640625, "learning_rate": 9.502209586085442e-07, "loss": 0.0006, "num_tokens": 2183592.0, "reward": 2.6267528533935547, "reward_std": 0.41746888160705564, "rewards/classifier_reward": 0.9338955879211426, "rewards/length_reward": 0.7142857253551483, "rewards/slop_reward": 0.9785714268684387, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 224.37144165039064, "epoch": 0.21, "grad_norm": 8.428058041901203, "kl": 0.534765625, "learning_rate": 9.483581056040718e-07, "loss": 0.0005, "num_tokens": 2209201.0, "reward": 2.4542306900024413, "reward_std": 0.4101273000240326, "rewards/classifier_reward": 0.9113734126091003, "rewards/length_reward": 0.5428571552038193, "rewards/slop_reward": 1.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 257.71429443359375, "epoch": 0.2125, "grad_norm": 9.035933890702934, "kl": 0.5140625, "learning_rate": 9.464629290747842e-07, "loss": 0.0005, "num_tokens": 2236141.0, "reward": 2.5825096130371095, "reward_std": 0.45454559922218324, "rewards/classifier_reward": 0.9753666520118713, "rewards/length_reward": 0.6285714417695999, "rewards/slop_reward": 0.9785714268684387, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 246.74286804199218, "epoch": 0.215, "grad_norm": 9.318935519161128, "kl": 0.555078125, "learning_rate": 9.445355656498284e-07, "loss": 0.0006, "num_tokens": 2262697.0, "reward": 2.383002519607544, "reward_std": 0.5029416978359222, "rewards/classifier_reward": 0.8830024480819703, "rewards/length_reward": 0.5428571611642837, "rewards/slop_reward": 0.9571428537368775, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 324.9428741455078, "epoch": 0.2175, "grad_norm": 10.476800992230684, "kl": 0.515234375, "learning_rate": 9.425761542788048e-07, "loss": 0.001, "num_tokens": 2291611.0, "reward": 2.4253102779388427, "reward_std": 0.25630177855491637, "rewards/classifier_reward": 0.9878101944923401, "rewards/length_reward": 0.4857142955064774, "rewards/slop_reward": 0.9517857193946838, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 285.88572387695314, "epoch": 0.22, "grad_norm": 8.262503873333891, "kl": 0.623828125, "learning_rate": 9.40584836221749e-07, "loss": 0.0006, "num_tokens": 2319400.0, "reward": 2.459568977355957, "reward_std": 0.5227903485298157, "rewards/classifier_reward": 0.9595688700675964, "rewards/length_reward": 0.5428571581840516, "rewards/slop_reward": 0.9571428537368775, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 361.48572998046876, "epoch": 0.2225, "grad_norm": 8.639609161679585, "kl": 0.61015625, "learning_rate": 9.385617550389489e-07, "loss": 0.0006, "num_tokens": 2349972.0, "reward": 2.21625759601593, "reward_std": 0.43572868704795836, "rewards/classifier_reward": 0.8430432677268982, "rewards/length_reward": 0.4000000089406967, "rewards/slop_reward": 0.9732142806053161, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 230.5428680419922, "epoch": 0.225, "grad_norm": 9.545010126368608, "kl": 0.606640625, "learning_rate": 9.36507056580594e-07, "loss": 0.0006, "num_tokens": 2375941.0, "reward": 2.5845241069793703, "reward_std": 0.4674242250621319, "rewards/classifier_reward": 0.9559526205062866, "rewards/length_reward": 0.6285714387893677, "rewards/slop_reward": 1.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 255.971435546875, "epoch": 0.2275, "grad_norm": 8.313767260844978, "kl": 0.705859375, "learning_rate": 9.34420888976262e-07, "loss": 0.0007, "num_tokens": 2402820.0, "reward": 2.281861972808838, "reward_std": 0.5015565395355225, "rewards/classifier_reward": 0.9032904744148255, "rewards/length_reward": 0.48571430444717406, "rewards/slop_reward": 0.8928571343421936, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 285.71430053710935, "epoch": 0.23, "grad_norm": 15.296685695089757, "kl": 12.5765625, "learning_rate": 9.323034026242377e-07, "loss": 0.0126, "num_tokens": 2430740.0, "reward": 2.457769823074341, "reward_std": 0.4592562437057495, "rewards/classifier_reward": 0.9363411664962769, "rewards/length_reward": 0.542857152223587, "rewards/slop_reward": 0.9785714268684387, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 283.9143035888672, "epoch": 0.2325, "grad_norm": 41.32916438788783, "kl": 0.500390625, "learning_rate": 9.301547501806724e-07, "loss": 0.0005, "num_tokens": 2458445.0, "reward": 2.432806062698364, "reward_std": 0.4543603718280792, "rewards/classifier_reward": 0.8256630301475525, "rewards/length_reward": 0.6285714387893677, "rewards/slop_reward": 0.9785714268684387, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 231.2571533203125, "epoch": 0.235, "grad_norm": 8.514937250494981, "kl": 0.56875, "learning_rate": 9.279750865485772e-07, "loss": 0.001, "num_tokens": 2484459.0, "reward": 2.6583719730377195, "reward_std": 0.41457981467247007, "rewards/classifier_reward": 0.9726575374603271, "rewards/length_reward": 0.6857142984867096, "rewards/slop_reward": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 239.4571563720703, "epoch": 0.2375, "grad_norm": 8.130278831610969, "kl": 0.586328125, "learning_rate": 9.257645688666555e-07, "loss": 0.0006, "num_tokens": 2510647.0, "reward": 2.552411127090454, "reward_std": 0.444735050201416, "rewards/classifier_reward": 0.9220538377761841, "rewards/length_reward": 0.6571428656578064, "rewards/slop_reward": 0.9732142806053161, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 248.48572692871093, "epoch": 0.24, "grad_norm": 11.012318216722969, "kl": 0.559375, "learning_rate": 9.235233564979754e-07, "loss": 0.0006, "num_tokens": 2537166.0, "reward": 2.3741667747497557, "reward_std": 0.5231991052627564, "rewards/classifier_reward": 0.7955952703952789, "rewards/length_reward": 0.600000011920929, "rewards/slop_reward": 0.9785714268684387, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 242.91429748535157, "epoch": 0.2425, "grad_norm": 9.607401037517901, "kl": 0.503125, "learning_rate": 9.212516110184794e-07, "loss": 0.0005, "num_tokens": 2563588.0, "reward": 2.6350881576538088, "reward_std": 0.47173853516578673, "rewards/classifier_reward": 0.8618737578392028, "rewards/length_reward": 0.8000000059604645, "rewards/slop_reward": 0.9732142806053161, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 193.971435546875, "epoch": 0.245, "grad_norm": 10.623433793787921, "kl": 0.7234375, "learning_rate": 9.189494962053368e-07, "loss": 0.0007, "num_tokens": 2588297.0, "reward": 2.389581322669983, "reward_std": 0.4812875479459763, "rewards/classifier_reward": 0.903866958618164, "rewards/length_reward": 0.4857142984867096, "rewards/slop_reward": 1.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 171.51429443359376, "epoch": 0.2475, "grad_norm": 11.438435269316447, "kl": 0.64375, "learning_rate": 9.166171780251364e-07, "loss": 0.0006, "num_tokens": 2612015.0, "reward": 2.513836717605591, "reward_std": 0.26052397638559344, "rewards/classifier_reward": 0.8852650642395019, "rewards/length_reward": 0.6285714328289032, "rewards/slop_reward": 1.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 167.14286346435546, "epoch": 0.25, "grad_norm": 11.041028322429328, "kl": 0.7359375, "learning_rate": 9.14254824621921e-07, "loss": 0.0007, "num_tokens": 2635785.0, "reward": 2.355943202972412, "reward_std": 0.33155601024627684, "rewards/classifier_reward": 0.8416573882102967, "rewards/length_reward": 0.5142857193946838, "rewards/slop_reward": 1.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 163.74286499023438, "epoch": 0.2525, "grad_norm": 9.00301084087431, "kl": 0.75390625, "learning_rate": 9.118626063050661e-07, "loss": 0.0012, "num_tokens": 2659436.0, "reward": 2.5006046295166016, "reward_std": 0.4252849280834198, "rewards/classifier_reward": 0.9506044745445251, "rewards/length_reward": 0.571428582072258, "rewards/slop_reward": 0.9785714268684387, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 195.42858276367187, "epoch": 0.255, "grad_norm": 9.702866580028527, "kl": 0.79921875, "learning_rate": 9.094406955370008e-07, "loss": 0.0008, "num_tokens": 2683861.0, "reward": 2.5624767780303954, "reward_std": 0.47246721386909485, "rewards/classifier_reward": 0.9267622709274292, "rewards/length_reward": 0.6571428716182709, "rewards/slop_reward": 0.9785714268684387, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 183.8857223510742, "epoch": 0.2575, "grad_norm": 7.999927126827247, "kl": 0.7359375, "learning_rate": 9.069892669207757e-07, "loss": 0.0007, "num_tokens": 2708217.0, "reward": 2.4374377012252806, "reward_std": 0.3513069462031126, "rewards/classifier_reward": 0.8374375879764557, "rewards/length_reward": 0.6000000059604644, "rewards/slop_reward": 1.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 203.74286804199218, "epoch": 0.26, "grad_norm": 8.535286841601573, "kl": 0.6046875, "learning_rate": 9.045084971874737e-07, "loss": 0.0006, "num_tokens": 2733039.0, "reward": 2.590514373779297, "reward_std": 0.3418044149875641, "rewards/classifier_reward": 0.7905142605304718, "rewards/length_reward": 0.8000000059604645, "rewards/slop_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 186.88572387695314, "epoch": 0.2625, "grad_norm": 11.150131541909593, "kl": 2.3078125, "learning_rate": 9.019985651834703e-07, "loss": 0.0023, "num_tokens": 2757500.0, "reward": 2.6002991676330565, "reward_std": 0.47049993872642515, "rewards/classifier_reward": 0.850299060344696, "rewards/length_reward": 0.7714285790920258, "rewards/slop_reward": 0.9785714268684387, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 231.48572692871093, "epoch": 0.265, "grad_norm": 7.346740346808247, "kl": 0.694140625, "learning_rate": 8.994596518575391e-07, "loss": 0.0007, "num_tokens": 2783522.0, "reward": 2.6393914222717285, "reward_std": 0.4599771976470947, "rewards/classifier_reward": 0.8965341806411743, "rewards/length_reward": 0.7428571581840515, "rewards/slop_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 293.51429748535156, "epoch": 0.2675, "grad_norm": 7.782363877670758, "kl": 0.71328125, "learning_rate": 8.968919402478075e-07, "loss": 0.0007, "num_tokens": 2811715.0, "reward": 2.497778224945068, "reward_std": 0.49988613873720167, "rewards/classifier_reward": 0.8692066669464111, "rewards/length_reward": 0.6285714387893677, "rewards/slop_reward": 1.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 202.40000915527344, "epoch": 0.27, "grad_norm": 8.407812702495294, "kl": 0.9671875, "learning_rate": 8.942956154685595e-07, "loss": 0.001, "num_tokens": 2836377.0, "reward": 2.7594990730285645, "reward_std": 0.35219337940216067, "rewards/classifier_reward": 0.9237847208976746, "rewards/length_reward": 0.8571428656578064, "rewards/slop_reward": 0.9785714268684387, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 311.14288024902345, "epoch": 0.2725, "grad_norm": 10.954001826900827, "kl": 0.803515625, "learning_rate": 8.916708646968923e-07, "loss": 0.0008, "num_tokens": 2865187.0, "reward": 2.3341631174087523, "reward_std": 0.3795748669654131, "rewards/classifier_reward": 0.6913058979436755, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.9571428537368775, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 295.88572387695314, "epoch": 0.275, "grad_norm": 7.24224369671047, "kl": 0.6296875, "learning_rate": 8.890178771592197e-07, "loss": 0.0006, "num_tokens": 2893081.0, "reward": 2.574794292449951, "reward_std": 0.4517861694097519, "rewards/classifier_reward": 0.9319370150566101, "rewards/length_reward": 0.6857142984867096, "rewards/slop_reward": 0.9571428537368775, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 310.4285858154297, "epoch": 0.2775, "grad_norm": 6.977209722960256, "kl": 1.34609375, "learning_rate": 8.863368441176325e-07, "loss": 0.0013, "num_tokens": 2921771.0, "reward": 2.2888038635253904, "reward_std": 0.6448704779148102, "rewards/classifier_reward": 0.8455002188682557, "rewards/length_reward": 0.5142857253551483, "rewards/slop_reward": 0.9290178537368774, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 251.82858276367188, "epoch": 0.28, "grad_norm": 6.8381698154250525, "kl": 0.86640625, "learning_rate": 8.836279588561081e-07, "loss": 0.0009, "num_tokens": 2948383.0, "reward": 2.4712666511535644, "reward_std": 0.4860814154148102, "rewards/classifier_reward": 0.8141236484050751, "rewards/length_reward": 0.6571428716182709, "rewards/slop_reward": 1.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 275.74287109375, "epoch": 0.2825, "grad_norm": 5.967234970457837, "kl": 0.8515625, "learning_rate": 8.808914166665772e-07, "loss": 0.0013, "num_tokens": 2975877.0, "reward": 2.6856667041778564, "reward_std": 0.38376912772655486, "rewards/classifier_reward": 0.9856665849685669, "rewards/length_reward": 0.7428571522235871, "rewards/slop_reward": 0.9571428537368775, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 291.1428680419922, "epoch": 0.285, "grad_norm": 6.696088875243213, "kl": 0.61484375, "learning_rate": 8.781274148348436e-07, "loss": 0.0006, "num_tokens": 3003901.0, "reward": 2.584765911102295, "reward_std": 0.441849821805954, "rewards/classifier_reward": 0.8704800248146057, "rewards/length_reward": 0.7142857313156128, "rewards/slop_reward": 1.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 286.6000152587891, "epoch": 0.2875, "grad_norm": 5.913822453334042, "kl": 0.678125, "learning_rate": 8.753361526263621e-07, "loss": 0.0007, "num_tokens": 3031852.0, "reward": 2.6028482913970947, "reward_std": 0.2958831213414669, "rewards/classifier_reward": 0.9957053542137146, "rewards/length_reward": 0.6285714387893677, "rewards/slop_reward": 0.9785714268684387, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 292.20001525878905, "epoch": 0.29, "grad_norm": 5.738716860165764, "kl": 0.72578125, "learning_rate": 8.725178312718725e-07, "loss": 0.0012, "num_tokens": 3059999.0, "reward": 2.596400237083435, "reward_std": 0.3507813632488251, "rewards/classifier_reward": 0.953542971611023, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.9571428537368775, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 239.1428680419922, "epoch": 0.2925, "grad_norm": 69.40308583594562, "kl": 1.6, "learning_rate": 8.696726539528923e-07, "loss": 0.0021, "num_tokens": 3086289.0, "reward": 2.7766035079956053, "reward_std": 0.3328893929719925, "rewards/classifier_reward": 0.8908890843391418, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 259.22858276367185, "epoch": 0.295, "grad_norm": 5.40703806614853, "kl": 0.734765625, "learning_rate": 8.668008257870682e-07, "loss": 0.0012, "num_tokens": 3113282.0, "reward": 2.7762694358825684, "reward_std": 0.2866129666566849, "rewards/classifier_reward": 0.9476978421211243, "rewards/length_reward": 0.8285714328289032, "rewards/slop_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 201.91429443359374, "epoch": 0.2975, "grad_norm": 7.252699872821293, "kl": 0.81015625, "learning_rate": 8.639025538133897e-07, "loss": 0.0013, "num_tokens": 3138256.0, "reward": 2.851440095901489, "reward_std": 0.20830639004707335, "rewards/classifier_reward": 0.9085827589035034, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 250.20001525878905, "epoch": 0.3, "grad_norm": 6.062851209216701, "kl": 0.92578125, "learning_rate": 8.609780469772621e-07, "loss": 0.0014, "num_tokens": 3164933.0, "reward": 2.786311960220337, "reward_std": 0.29145972728729247, "rewards/classifier_reward": 0.9005975008010865, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 199.4857208251953, "epoch": 0.3025, "grad_norm": 8.363986536112053, "kl": 0.746875, "learning_rate": 8.580275161154431e-07, "loss": 0.0007, "num_tokens": 3189764.0, "reward": 2.89967794418335, "reward_std": 0.15721405297517776, "rewards/classifier_reward": 0.9496778607368469, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 218.88572387695314, "epoch": 0.305, "grad_norm": 104.91081313736798, "kl": 17.3171875, "learning_rate": 8.550511739408428e-07, "loss": 0.0182, "num_tokens": 3215345.0, "reward": 2.980521392822266, "reward_std": 0.05153606534004211, "rewards/classifier_reward": 0.9805211901664734, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 222.91430053710937, "epoch": 0.3075, "grad_norm": 7.509956960673518, "kl": 0.934375, "learning_rate": 8.520492350271895e-07, "loss": 0.001, "num_tokens": 3241067.0, "reward": 2.8307112216949464, "reward_std": 0.1734127746662125, "rewards/classifier_reward": 0.8521397054195404, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 240.17144165039062, "epoch": 0.31, "grad_norm": 6.959659369798759, "kl": 1.028125, "learning_rate": 8.490219157935588e-07, "loss": 0.0015, "num_tokens": 3267393.0, "reward": 2.7303539276123048, "reward_std": 0.1922714289277792, "rewards/classifier_reward": 0.9517823934555054, "rewards/length_reward": 0.8000000029802322, "rewards/slop_reward": 0.9785714268684387, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 229.5428680419922, "epoch": 0.3125, "grad_norm": 8.221210886450612, "kl": 1.91640625, "learning_rate": 8.459694344887731e-07, "loss": 0.0019, "num_tokens": 3293186.0, "reward": 2.739912986755371, "reward_std": 0.36319895684719083, "rewards/classifier_reward": 0.9541985750198364, "rewards/length_reward": 0.8285714328289032, "rewards/slop_reward": 0.9571428537368775, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 216.02858276367186, "epoch": 0.315, "grad_norm": 6.737006006501442, "kl": 1.015625, "learning_rate": 8.428920111756657e-07, "loss": 0.0015, "num_tokens": 3318667.0, "reward": 2.7134992361068724, "reward_std": 0.2444542996585369, "rewards/classifier_reward": 0.8563561499118805, "rewards/length_reward": 0.8571428596973419, "rewards/slop_reward": 1.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 226.20001220703125, "epoch": 0.3175, "grad_norm": 4.36908357763648, "kl": 0.96640625, "learning_rate": 8.397898677152172e-07, "loss": 0.0024, "num_tokens": 3344503.0, "reward": 2.9305933475494386, "reward_std": 0.12042829990386963, "rewards/classifier_reward": 0.9591646075248719, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 203.31429443359374, "epoch": 0.32, "grad_norm": 6.1156992776971855, "kl": 0.85703125, "learning_rate": 8.366632277505597e-07, "loss": 0.0018, "num_tokens": 3369294.0, "reward": 2.9620502471923826, "reward_std": 0.07551092505455018, "rewards/classifier_reward": 0.983478581905365, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 220.0285858154297, "epoch": 0.3225, "grad_norm": 6.8222141729156975, "kl": 0.85859375, "learning_rate": 8.335123166908543e-07, "loss": 0.0013, "num_tokens": 3394915.0, "reward": 2.7929779529571532, "reward_std": 0.28287690281867983, "rewards/classifier_reward": 0.9340491890907288, "rewards/length_reward": 0.8857142865657807, "rewards/slop_reward": 0.9732142806053161, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 225.62857971191406, "epoch": 0.325, "grad_norm": 4.536469246397557, "kl": 1.11015625, "learning_rate": 8.303373616950406e-07, "loss": 0.0025, "num_tokens": 3420549.0, "reward": 2.9556642055511473, "reward_std": 0.07710518054664135, "rewards/classifier_reward": 0.9985211491584778, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9571428537368775, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 220.68572387695312, "epoch": 0.3275, "grad_norm": 2.9694599664159407, "kl": 0.9546875, "learning_rate": 8.271385916554604e-07, "loss": 0.0029, "num_tokens": 3445788.0, "reward": 2.996094989776611, "reward_std": 0.010332237184047698, "rewards/classifier_reward": 0.9960947871208191, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 296.20001831054685, "epoch": 0.33, "grad_norm": 4.064839178436649, "kl": 1.1390625, "learning_rate": 8.23916237181355e-07, "loss": 0.0016, "num_tokens": 3474044.0, "reward": 2.4268852710723876, "reward_std": 0.15440489053726197, "rewards/classifier_reward": 0.9697423577308655, "rewards/length_reward": 0.4571428596973419, "rewards/slop_reward": 1.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 282.771435546875, "epoch": 0.3325, "grad_norm": 10.750678349276473, "kl": 0.82734375, "learning_rate": 8.206705305822412e-07, "loss": 0.0013, "num_tokens": 3501861.0, "reward": 2.4831544876098635, "reward_std": 0.26082203090190886, "rewards/classifier_reward": 0.9831543445587159, "rewards/length_reward": 0.5428571492433548, "rewards/slop_reward": 0.9571428537368775, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 247.34286804199218, "epoch": 0.335, "grad_norm": 106.5245827902456, "kl": 88.11328125, "learning_rate": 8.174017058511628e-07, "loss": 0.0893, "num_tokens": 3528356.0, "reward": 2.8546416759490967, "reward_std": 0.2578580856323242, "rewards/classifier_reward": 0.9903557300567627, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 0.9785714268684387, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 280.2857299804688, "epoch": 0.3375, "grad_norm": 5.075379696681929, "kl": 1.60859375, "learning_rate": 8.141099986478212e-07, "loss": 0.0021, "num_tokens": 3555922.0, "reward": 2.635714387893677, "reward_std": 0.31418272852897644, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.6571428656578064, "rewards/slop_reward": 0.9785714268684387, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 304.5714416503906, "epoch": 0.34, "grad_norm": 5.929984152853479, "kl": 1.1859375, "learning_rate": 8.107956462815861e-07, "loss": 0.0017, "num_tokens": 3584471.0, "reward": 2.3495986461639404, "reward_std": 0.3951677083969116, "rewards/classifier_reward": 0.8853127479553222, "rewards/length_reward": 0.4857142955064774, "rewards/slop_reward": 0.9785714268684387, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 288.2857299804688, "epoch": 0.3425, "grad_norm": 8.1455480169376, "kl": 1.215625, "learning_rate": 8.074588876943872e-07, "loss": 0.0012, "num_tokens": 3612481.0, "reward": 2.4660715579986574, "reward_std": 0.5484442114830017, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.5142857283353806, "rewards/slop_reward": 0.9517857074737549, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 267.8000061035156, "epoch": 0.345, "grad_norm": 5.930202214193798, "kl": 0.98125, "learning_rate": 8.040999634434882e-07, "loss": 0.0015, "num_tokens": 3639774.0, "reward": 2.7785715579986574, "reward_std": 0.39145426750183104, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.800000011920929, "rewards/slop_reward": 0.9785714268684387, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 255.57144165039062, "epoch": 0.3475, "grad_norm": 4.399939053025549, "kl": 1.53125, "learning_rate": 8.00719115684144e-07, "loss": 0.0025, "num_tokens": 3666639.0, "reward": 2.6642858505249025, "reward_std": 0.21827136874198913, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.6857142925262452, "rewards/slop_reward": 0.9785714268684387, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 251.571435546875, "epoch": 0.35, "grad_norm": 6.704733780428727, "kl": 1.0859375, "learning_rate": 7.973165881521433e-07, "loss": 0.002, "num_tokens": 3693159.0, "reward": 2.8857144355773925, "reward_std": 0.24877579212188722, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 248.9428680419922, "epoch": 0.3525, "grad_norm": 3.4261047442168397, "kl": 0.93359375, "learning_rate": 7.938926261462365e-07, "loss": 0.0028, "num_tokens": 3719792.0, "reward": 2.742554450035095, "reward_std": 0.16382334232330323, "rewards/classifier_reward": 0.8568399548530579, "rewards/length_reward": 0.8857142865657807, "rewards/slop_reward": 1.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 212.3714385986328, "epoch": 0.355, "grad_norm": 6.044238574227886, "kl": 1.3890625, "learning_rate": 7.90447476510452e-07, "loss": 0.0028, "num_tokens": 3745103.0, "reward": 2.9194665908813477, "reward_std": 0.21307192444801332, "rewards/classifier_reward": 0.9980378150939941, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 0.9785714268684387, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 184.5428680419922, "epoch": 0.3575, "grad_norm": 4.2944694392303155, "kl": 1.140625, "learning_rate": 7.869813876162998e-07, "loss": 0.003, "num_tokens": 3769090.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 207.1428680419922, "epoch": 0.36, "grad_norm": 2.4809967074334627, "kl": 1.1765625, "learning_rate": 7.834946093448658e-07, "loss": 0.0031, "num_tokens": 3794079.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 197.17143859863282, "epoch": 0.3625, "grad_norm": 3.755882982422092, "kl": 1.396875, "learning_rate": 7.799873930687977e-07, "loss": 0.0033, "num_tokens": 3818773.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 187.71429443359375, "epoch": 0.365, "grad_norm": 6.482025225350517, "kl": 2.0046875, "learning_rate": 7.764599916341816e-07, "loss": 0.003, "num_tokens": 3843103.0, "reward": 2.892857313156128, "reward_std": 0.22987756729125977, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9142857193946838, "rewards/slop_reward": 0.9785714268684387, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 214.8857208251953, "epoch": 0.3675, "grad_norm": 3.8265113738201806, "kl": 1.3890625, "learning_rate": 7.729126593423149e-07, "loss": 0.0033, "num_tokens": 3868513.0, "reward": 2.997367763519287, "reward_std": 0.006964774429798126, "rewards/classifier_reward": 0.9973675608634949, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 184.1428649902344, "epoch": 0.37, "grad_norm": 18.75869388269992, "kl": 1.515625, "learning_rate": 7.693456519313719e-07, "loss": 0.0029, "num_tokens": 3892878.0, "reward": 2.851199245452881, "reward_std": 0.18016420006752015, "rewards/classifier_reward": 0.9869133591651916, "rewards/length_reward": 0.8857142865657807, "rewards/slop_reward": 0.9785714268684387, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 193.6571502685547, "epoch": 0.3725, "grad_norm": 5.454996617575025, "kl": 1.684375, "learning_rate": 7.657592265579669e-07, "loss": 0.0031, "num_tokens": 3917511.0, "reward": 2.993258571624756, "reward_std": 0.01783668529242277, "rewards/classifier_reward": 0.9932583689689636, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 181.31429138183594, "epoch": 0.375, "grad_norm": 3.955698987070301, "kl": 1.3390625, "learning_rate": 7.621536417786158e-07, "loss": 0.0032, "num_tokens": 3941554.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 183.9714385986328, "epoch": 0.3775, "grad_norm": 4.5979098970153185, "kl": 1.528125, "learning_rate": 7.585291575310952e-07, "loss": 0.0034, "num_tokens": 3965818.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 175.71429443359375, "epoch": 0.38, "grad_norm": 7.501271856937242, "kl": 1.790625, "learning_rate": 7.548860351157027e-07, "loss": 0.0027, "num_tokens": 3989746.0, "reward": 2.7536909580230713, "reward_std": 0.3092236161231995, "rewards/classifier_reward": 0.9251193881034852, "rewards/length_reward": 0.8285714387893677, "rewards/slop_reward": 1.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 210.571435546875, "epoch": 0.3825, "grad_norm": 5.2866175605, "kl": 2.065625, "learning_rate": 7.512245371764196e-07, "loss": 0.0035, "num_tokens": 4015036.0, "reward": 2.9245490074157714, "reward_std": 0.12335940003395081, "rewards/classifier_reward": 0.9727631211280823, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9517857074737549, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 187.771435546875, "epoch": 0.385, "grad_norm": 4.602093297361531, "kl": 2.203125, "learning_rate": 7.475449276819752e-07, "loss": 0.0041, "num_tokens": 4039528.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 175.60000610351562, "epoch": 0.3875, "grad_norm": 6.76968523287945, "kl": 1.71875, "learning_rate": 7.438474719068173e-07, "loss": 0.0031, "num_tokens": 4063594.0, "reward": 2.914285898208618, "reward_std": 0.1731828987598419, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9142857193946838, "rewards/slop_reward": 1.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 181.31429443359374, "epoch": 0.39, "grad_norm": 3.9415238872059795, "kl": 3.5171875, "learning_rate": 7.401324364119871e-07, "loss": 0.0054, "num_tokens": 4087555.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 199.22858276367188, "epoch": 0.3925, "grad_norm": 0.6743528375242909, "kl": 1.6703125, "learning_rate": 7.364000890259023e-07, "loss": 0.0041, "num_tokens": 4112265.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 203.02857971191406, "epoch": 0.395, "grad_norm": 0.21235184389038209, "kl": 1.5953125, "learning_rate": 7.326506988250487e-07, "loss": 0.004, "num_tokens": 4137291.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 211.00000915527343, "epoch": 0.3975, "grad_norm": 0.7871274406295002, "kl": 1.9828125, "learning_rate": 7.288845361145812e-07, "loss": 0.0044, "num_tokens": 4162596.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 205.82857971191407, "epoch": 0.4, "grad_norm": 0.3126714050105658, "kl": 1.596875, "learning_rate": 7.251018724088366e-07, "loss": 0.004, "num_tokens": 4187048.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 197.42857666015624, "epoch": 0.4025, "grad_norm": 0.3330658614195518, "kl": 1.5875, "learning_rate": 7.213029804117603e-07, "loss": 0.004, "num_tokens": 4211839.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 182.74286499023438, "epoch": 0.405, "grad_norm": 0.09053574072038517, "kl": 1.2125, "learning_rate": 7.174881339972448e-07, "loss": 0.0036, "num_tokens": 4236155.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 189.971435546875, "epoch": 0.4075, "grad_norm": 3.922781639240598, "kl": 1.1859375, "learning_rate": 7.136576081893863e-07, "loss": 0.0031, "num_tokens": 4260724.0, "reward": 2.968118953704834, "reward_std": 0.07463454008102417, "rewards/classifier_reward": 0.9966901540756226, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 211.22857971191405, "epoch": 0.41, "grad_norm": 0.5831119895182555, "kl": 1.22265625, "learning_rate": 7.09811679142657e-07, "loss": 0.0036, "num_tokens": 4285850.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 203.71429748535155, "epoch": 0.4125, "grad_norm": 2.3433145018681825, "kl": 1.1546875, "learning_rate": 7.059506241219964e-07, "loss": 0.0031, "num_tokens": 4310900.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 200.91429443359374, "epoch": 0.415, "grad_norm": 0.18814570159680577, "kl": 1.1671875, "learning_rate": 7.02074721482822e-07, "loss": 0.0036, "num_tokens": 4335852.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 212.08572692871093, "epoch": 0.4175, "grad_norm": 3.9253886023797246, "kl": 1.08125, "learning_rate": 6.981842506509626e-07, "loss": 0.0025, "num_tokens": 4361111.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 202.82857971191407, "epoch": 0.42, "grad_norm": 118.43288868650292, "kl": 36.20390625, "learning_rate": 6.942794921025126e-07, "loss": 0.0382, "num_tokens": 4386130.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 194.0571533203125, "epoch": 0.4225, "grad_norm": 0.07954465329541446, "kl": 1.03984375, "learning_rate": 6.903607273436127e-07, "loss": 0.0034, "num_tokens": 4410840.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 193.7714385986328, "epoch": 0.425, "grad_norm": 0.0486408492542655, "kl": 0.978125, "learning_rate": 6.864282388901543e-07, "loss": 0.0034, "num_tokens": 4435370.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 208.08572692871093, "epoch": 0.4275, "grad_norm": 2.6728463524577752, "kl": 0.92265625, "learning_rate": 6.824823102474126e-07, "loss": 0.0028, "num_tokens": 4460308.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 202.22858276367188, "epoch": 0.43, "grad_norm": 0.13449796435482986, "kl": 1.034375, "learning_rate": 6.785232258896076e-07, "loss": 0.0034, "num_tokens": 4485226.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 182.68572082519532, "epoch": 0.4325, "grad_norm": 7.189835012946043, "kl": 0.95078125, "learning_rate": 6.745512712393957e-07, "loss": 0.0024, "num_tokens": 4509446.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 192.20000915527345, "epoch": 0.435, "grad_norm": 3.314581838568204, "kl": 0.9203125, "learning_rate": 6.705667326472924e-07, "loss": 0.0028, "num_tokens": 4533638.0, "reward": 2.992787170410156, "reward_std": 0.019083873927593233, "rewards/classifier_reward": 0.992786979675293, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 181.7714416503906, "epoch": 0.4375, "grad_norm": 4.868845585454781, "kl": 0.91015625, "learning_rate": 6.665698973710288e-07, "loss": 0.0023, "num_tokens": 4557920.0, "reward": 2.914285898208618, "reward_std": 0.1731828987598419, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9142857193946838, "rewards/slop_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 202.4857208251953, "epoch": 0.44, "grad_norm": 5.277192051669204, "kl": 0.9015625, "learning_rate": 6.625610535548417e-07, "loss": 0.0028, "num_tokens": 4582927.0, "reward": 2.971197080612183, "reward_std": 0.07549313902854919, "rewards/classifier_reward": 0.9997682809829712, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 191.3714385986328, "epoch": 0.4425, "grad_norm": 3.5895293665237213, "kl": 0.86953125, "learning_rate": 6.58540490208701e-07, "loss": 0.0028, "num_tokens": 4607545.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 204.74286804199218, "epoch": 0.445, "grad_norm": 3.6924099151163348, "kl": 0.88046875, "learning_rate": 6.545084971874736e-07, "loss": 0.0023, "num_tokens": 4632631.0, "reward": 2.9700303077697754, "reward_std": 0.07929292395710945, "rewards/classifier_reward": 0.9986015200614929, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 210.74286804199218, "epoch": 0.4475, "grad_norm": 4.558365673779022, "kl": 1.03125, "learning_rate": 6.504653651700277e-07, "loss": 0.0025, "num_tokens": 4657813.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 209.17143859863282, "epoch": 0.45, "grad_norm": 0.04420464498654459, "kl": 0.86640625, "learning_rate": 6.464113856382751e-07, "loss": 0.0033, "num_tokens": 4683054.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 218.4857208251953, "epoch": 0.4525, "grad_norm": 2.557396683384517, "kl": 1.0640625, "learning_rate": 6.423468508561598e-07, "loss": 0.003, "num_tokens": 4708257.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 229.88572387695314, "epoch": 0.455, "grad_norm": 3.1120823972576854, "kl": 1.415625, "learning_rate": 6.382720538485855e-07, "loss": 0.0033, "num_tokens": 4734223.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 226.40000915527344, "epoch": 0.4575, "grad_norm": 0.06880695435363977, "kl": 0.846875, "learning_rate": 6.341872883802922e-07, "loss": 0.0032, "num_tokens": 4759812.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 235.05715637207032, "epoch": 0.46, "grad_norm": 2.2167569033565937, "kl": 0.75546875, "learning_rate": 6.300928489346765e-07, "loss": 0.0027, "num_tokens": 4785935.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 227.91429443359374, "epoch": 0.4625, "grad_norm": 2.5442698264193897, "kl": 0.7765625, "learning_rate": 6.259890306925626e-07, "loss": 0.0027, "num_tokens": 4811832.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 221.2571533203125, "epoch": 0.465, "grad_norm": 4.529163471508088, "kl": 0.79765625, "learning_rate": 6.218761295109208e-07, "loss": 0.0018, "num_tokens": 4837398.0, "reward": 2.9214287281036375, "reward_std": 0.2078804552555084, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 0.9785714268684387, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 218.40000915527344, "epoch": 0.4675, "grad_norm": 0.04502419697939018, "kl": 0.8265625, "learning_rate": 6.177544419015387e-07, "loss": 0.0032, "num_tokens": 4862776.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 224.85715026855468, "epoch": 0.47, "grad_norm": 2.7870230542981322, "kl": 0.825, "learning_rate": 6.13624265009645e-07, "loss": 0.0027, "num_tokens": 4888566.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 228.0571502685547, "epoch": 0.4725, "grad_norm": 0.06424356320573535, "kl": 0.8890625, "learning_rate": 6.094858965924866e-07, "loss": 0.0033, "num_tokens": 4914374.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 224.4571533203125, "epoch": 0.475, "grad_norm": 4.417906183651794, "kl": 0.90703125, "learning_rate": 6.053396349978631e-07, "loss": 0.0023, "num_tokens": 4940085.0, "reward": 2.8559008598327638, "reward_std": 0.20502071976661682, "rewards/classifier_reward": 0.9344720721244812, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 0.9785714268684387, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 228.57143859863282, "epoch": 0.4775, "grad_norm": 0.7826472563807869, "kl": 0.95703125, "learning_rate": 6.011857791426178e-07, "loss": 0.0033, "num_tokens": 4966005.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 228.171435546875, "epoch": 0.48, "grad_norm": 0.2991397618360992, "kl": 1.06015625, "learning_rate": 5.970246284910876e-07, "loss": 0.0034, "num_tokens": 4991803.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 245.6285827636719, "epoch": 0.4825, "grad_norm": 4.590555458209977, "kl": 0.915625, "learning_rate": 5.92856483033514e-07, "loss": 0.0023, "num_tokens": 5018320.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 231.00001220703126, "epoch": 0.485, "grad_norm": 3.8249513457346622, "kl": 0.83125, "learning_rate": 5.886816432644154e-07, "loss": 0.0023, "num_tokens": 5044075.0, "reward": 2.9297013759613035, "reward_std": 0.12975128293037413, "rewards/classifier_reward": 0.9797011494636536, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 232.00000915527343, "epoch": 0.4875, "grad_norm": 3.4135057358827834, "kl": 1.3140625, "learning_rate": 5.845004101609246e-07, "loss": 0.0032, "num_tokens": 5069796.0, "reward": 2.9581347465515138, "reward_std": 0.10335763692855834, "rewards/classifier_reward": 0.9867059469223023, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 236.68572692871095, "epoch": 0.49, "grad_norm": 4.013642506936478, "kl": 0.85390625, "learning_rate": 5.803130851610885e-07, "loss": 0.0023, "num_tokens": 5095958.0, "reward": 2.9428573131561278, "reward_std": 0.15118578672409058, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 238.4571533203125, "epoch": 0.4925, "grad_norm": 4.603644905524406, "kl": 0.940625, "learning_rate": 5.761199701421391e-07, "loss": 0.0019, "num_tokens": 5121931.0, "reward": 2.941946840286255, "reward_std": 0.15359463561326264, "rewards/classifier_reward": 0.9990895390510559, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 238.02857971191406, "epoch": 0.495, "grad_norm": 2.4430881235820507, "kl": 0.88046875, "learning_rate": 5.719213673987276e-07, "loss": 0.0028, "num_tokens": 5148140.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 228.7714385986328, "epoch": 0.4975, "grad_norm": 2.885447633946335, "kl": 0.8984375, "learning_rate": 5.677175796211332e-07, "loss": 0.0028, "num_tokens": 5173797.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 234.77144470214844, "epoch": 0.5, "grad_norm": 0.04186680424978939, "kl": 0.9296875, "learning_rate": 5.635089098734393e-07, "loss": 0.0033, "num_tokens": 5199798.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 224.51429443359376, "epoch": 0.5025, "grad_norm": 2.794225657350946, "kl": 0.95, "learning_rate": 5.592956615716866e-07, "loss": 0.0029, "num_tokens": 5225576.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 216.02857971191406, "epoch": 0.505, "grad_norm": 4.294508436959706, "kl": 1.10625, "learning_rate": 5.550781384619973e-07, "loss": 0.0025, "num_tokens": 5251038.0, "reward": 2.975967788696289, "reward_std": 0.0635837346315384, "rewards/classifier_reward": 0.9759676098823548, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 218.571435546875, "epoch": 0.5075, "grad_norm": 0.043507163632059365, "kl": 0.91484375, "learning_rate": 5.50856644598678e-07, "loss": 0.0033, "num_tokens": 5276396.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 205.6571502685547, "epoch": 0.51, "grad_norm": 0.09961196396275367, "kl": 1.0578125, "learning_rate": 5.466314843222993e-07, "loss": 0.0034, "num_tokens": 5301460.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 225.9714385986328, "epoch": 0.5125, "grad_norm": 3.605683444813084, "kl": 0.9609375, "learning_rate": 5.424029622377546e-07, "loss": 0.0029, "num_tokens": 5327289.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 215.80001220703124, "epoch": 0.515, "grad_norm": 0.19828091837058953, "kl": 1.23828125, "learning_rate": 5.381713831923007e-07, "loss": 0.0036, "num_tokens": 5352596.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 214.8571563720703, "epoch": 0.5175, "grad_norm": 2.9547607246974565, "kl": 1.12890625, "learning_rate": 5.339370522535804e-07, "loss": 0.003, "num_tokens": 5377938.0, "reward": 2.992868709564209, "reward_std": 0.018868234753608704, "rewards/classifier_reward": 0.9928684830665588, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 252.00001525878906, "epoch": 0.52, "grad_norm": 871.3268746641542, "kl": 64.790625, "learning_rate": 5.297002746876284e-07, "loss": 0.0667, "num_tokens": 5404678.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 271.0857269287109, "epoch": 0.5225, "grad_norm": 13961.922953195786, "kl": 6094.1890625, "learning_rate": 5.254613559368648e-07, "loss": 6.1111, "num_tokens": 5432086.0, "reward": 2.8857144832611086, "reward_std": 0.18249738812446595, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 257.9714385986328, "epoch": 0.525, "grad_norm": 8.061516897407943, "kl": 1.028125, "learning_rate": 5.212206015980741e-07, "loss": 0.0025, "num_tokens": 5459016.0, "reward": 2.828571605682373, "reward_std": 0.1731828987598419, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8285714328289032, "rewards/slop_reward": 1.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 234.3714385986328, "epoch": 0.5275, "grad_norm": 0.05294394256827684, "kl": 1.0125, "learning_rate": 5.169783174003744e-07, "loss": 0.0034, "num_tokens": 5484886.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 244.20001220703125, "epoch": 0.53, "grad_norm": 2.7195604219419645, "kl": 1.353125, "learning_rate": 5.127348091831755e-07, "loss": 0.0033, "num_tokens": 5511353.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 247.80001220703124, "epoch": 0.5325, "grad_norm": 2.435046990313359, "kl": 0.98671875, "learning_rate": 5.084903828741312e-07, "loss": 0.0029, "num_tokens": 5537879.0, "reward": 2.828571653366089, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8285714298486709, "rewards/slop_reward": 1.0, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 233.82858276367188, "epoch": 0.535, "grad_norm": 2.848539339582319, "kl": 1.053125, "learning_rate": 5.042453444670828e-07, "loss": 0.003, "num_tokens": 5563937.0, "reward": 2.9500002384185793, "reward_std": 0.0866025447845459, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 243.6285827636719, "epoch": 0.5375, "grad_norm": 4.574390608580604, "kl": 0.91953125, "learning_rate": 5e-07, "loss": 0.0024, "num_tokens": 5590384.0, "reward": 2.8857144832611086, "reward_std": 0.19518001079559327, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 284.0571563720703, "epoch": 0.54, "grad_norm": 5.393964971542159, "kl": 1.16484375, "learning_rate": 4.957546555329173e-07, "loss": 0.0016, "num_tokens": 5618238.0, "reward": 2.6285715103149414, "reward_std": 0.25809029340744016, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.6285714328289032, "rewards/slop_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 244.40001525878907, "epoch": 0.5425, "grad_norm": 3.809477712898502, "kl": 0.7828125, "learning_rate": 4.915096171258689e-07, "loss": 0.0022, "num_tokens": 5644712.0, "reward": 2.828571605682373, "reward_std": 0.17318291068077088, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8285714328289032, "rewards/slop_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 227.60001220703126, "epoch": 0.545, "grad_norm": 0.7163836490261896, "kl": 1.0765625, "learning_rate": 4.872651908168244e-07, "loss": 0.0035, "num_tokens": 5670466.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 210.7714416503906, "epoch": 0.5475, "grad_norm": 0.06375443345853048, "kl": 0.86953125, "learning_rate": 4.830216825996256e-07, "loss": 0.0033, "num_tokens": 5695540.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 212.02858276367186, "epoch": 0.55, "grad_norm": 0.044596037608717574, "kl": 0.8765625, "learning_rate": 4.787793984019259e-07, "loss": 0.0033, "num_tokens": 5720881.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 200.80000915527344, "epoch": 0.5525, "grad_norm": 0.16974934473523362, "kl": 1.16640625, "learning_rate": 4.7453864406313536e-07, "loss": 0.0036, "num_tokens": 5745792.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 211.22857971191405, "epoch": 0.555, "grad_norm": 0.0818904708032968, "kl": 0.9921875, "learning_rate": 4.7029972531237154e-07, "loss": 0.0034, "num_tokens": 5770873.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 195.2571533203125, "epoch": 0.5575, "grad_norm": 0.05116820241495938, "kl": 0.9, "learning_rate": 4.6606294774641965e-07, "loss": 0.0033, "num_tokens": 5795571.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 190.57143859863282, "epoch": 0.56, "grad_norm": 3.069339312788395, "kl": 0.9046875, "learning_rate": 4.6182861680769923e-07, "loss": 0.0028, "num_tokens": 5819962.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 195.08572387695312, "epoch": 0.5625, "grad_norm": 2.051249437321434, "kl": 0.890625, "learning_rate": 4.5759703776224555e-07, "loss": 0.0028, "num_tokens": 5844710.0, "reward": 2.954781198501587, "reward_std": 0.0808977723121643, "rewards/classifier_reward": 0.9833523750305175, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 194.11429748535156, "epoch": 0.565, "grad_norm": 0.06947745807380493, "kl": 0.915625, "learning_rate": 4.5336851567770074e-07, "loss": 0.0033, "num_tokens": 5869322.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 196.68572082519532, "epoch": 0.5675, "grad_norm": 3.6616246314488534, "kl": 0.75234375, "learning_rate": 4.4914335540132204e-07, "loss": 0.0027, "num_tokens": 5893903.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 190.91429443359374, "epoch": 0.57, "grad_norm": 2.9722209992367854, "kl": 2.09140625, "learning_rate": 4.4492186153800284e-07, "loss": 0.004, "num_tokens": 5918505.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 184.91429138183594, "epoch": 0.5725, "grad_norm": 0.06042948982632837, "kl": 0.8359375, "learning_rate": 4.407043384283136e-07, "loss": 0.0032, "num_tokens": 5942897.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 182.25715026855468, "epoch": 0.575, "grad_norm": 0.07756768631242807, "kl": 0.8734375, "learning_rate": 4.364910901265606e-07, "loss": 0.0033, "num_tokens": 5967196.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 173.02857971191406, "epoch": 0.5775, "grad_norm": 0.9892808129257086, "kl": 1.546875, "learning_rate": 4.3228242037886687e-07, "loss": 0.0039, "num_tokens": 5990830.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 162.94286499023437, "epoch": 0.58, "grad_norm": 0.0578879268472676, "kl": 0.97578125, "learning_rate": 4.280786326012723e-07, "loss": 0.0034, "num_tokens": 6014434.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 165.42857666015624, "epoch": 0.5825, "grad_norm": 3.3348841449857387, "kl": 1.1046875, "learning_rate": 4.23880029857861e-07, "loss": 0.003, "num_tokens": 6038089.0, "reward": 2.914285945892334, "reward_std": 0.10690449476242066, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9142857193946838, "rewards/slop_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 163.71429138183595, "epoch": 0.585, "grad_norm": 3.485368879973419, "kl": 1.05703125, "learning_rate": 4.1968691483891133e-07, "loss": 0.003, "num_tokens": 6061739.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 163.00000915527343, "epoch": 0.5875, "grad_norm": 5.36230481850721, "kl": 1.00703125, "learning_rate": 4.154995898390755e-07, "loss": 0.002, "num_tokens": 6085364.0, "reward": 2.9397803783416747, "reward_std": 0.15932661443948745, "rewards/classifier_reward": 0.9969230651855469, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 177.0571502685547, "epoch": 0.59, "grad_norm": 3.3342971599613382, "kl": 1.04140625, "learning_rate": 4.1131835673558456e-07, "loss": 0.003, "num_tokens": 6109257.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 191.42857971191407, "epoch": 0.5925, "grad_norm": 0.08376084072008337, "kl": 1.02109375, "learning_rate": 4.0714351696648606e-07, "loss": 0.0034, "num_tokens": 6133846.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 192.08572692871093, "epoch": 0.595, "grad_norm": 0.046627278932137715, "kl": 0.98515625, "learning_rate": 4.029753715089123e-07, "loss": 0.0034, "num_tokens": 6158489.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 198.08572387695312, "epoch": 0.5975, "grad_norm": 11.565806887201926, "kl": 13.7140625, "learning_rate": 3.988142208573822e-07, "loss": 0.0161, "num_tokens": 6183159.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 197.31429443359374, "epoch": 0.6, "grad_norm": 0.05311856298193549, "kl": 1.02265625, "learning_rate": 3.94660365002137e-07, "loss": 0.0034, "num_tokens": 6207985.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 202.25715026855468, "epoch": 0.6025, "grad_norm": 4.782436637657736, "kl": 1.04921875, "learning_rate": 3.9051410340751346e-07, "loss": 0.0025, "num_tokens": 6232984.0, "reward": 2.8857144832611086, "reward_std": 0.1824974000453949, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 193.6285827636719, "epoch": 0.605, "grad_norm": 3.232063907518827, "kl": 1.15859375, "learning_rate": 3.8637573499035503e-07, "loss": 0.0031, "num_tokens": 6257629.0, "reward": 2.879447841644287, "reward_std": 0.11382801532745361, "rewards/classifier_reward": 0.9937332987785339, "rewards/length_reward": 0.8857142865657807, "rewards/slop_reward": 1.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 195.4571502685547, "epoch": 0.6075, "grad_norm": 0.06471530202926354, "kl": 1.05625, "learning_rate": 3.822455580984613e-07, "loss": 0.0034, "num_tokens": 6282207.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 274.0571594238281, "epoch": 0.61, "grad_norm": 3.0795183504620107, "kl": 1.50390625, "learning_rate": 3.781238704890792e-07, "loss": 0.0034, "num_tokens": 6309000.0, "reward": 2.783582401275635, "reward_std": 0.023816290497779845, "rewards/classifier_reward": 0.9835822105407714, "rewards/length_reward": 0.8, "rewards/slop_reward": 1.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 210.28572692871094, "epoch": 0.6125, "grad_norm": 3.0694884073186777, "kl": 1.0015625, "learning_rate": 3.7401096930743746e-07, "loss": 0.0029, "num_tokens": 6334093.0, "reward": 2.9978450298309327, "reward_std": 0.005702095478773117, "rewards/classifier_reward": 0.9978448033332825, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 212.02857971191406, "epoch": 0.615, "grad_norm": 2.5150561307413764, "kl": 1.084375, "learning_rate": 3.699071510653235e-07, "loss": 0.003, "num_tokens": 6359434.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 220.0571533203125, "epoch": 0.6175, "grad_norm": 0.04173992686860701, "kl": 0.96875, "learning_rate": 3.6581271161970784e-07, "loss": 0.0034, "num_tokens": 6384975.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 221.91429443359374, "epoch": 0.62, "grad_norm": 0.11609659945057246, "kl": 1.05078125, "learning_rate": 3.6172794615141446e-07, "loss": 0.0034, "num_tokens": 6410642.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 229.8571533203125, "epoch": 0.6225, "grad_norm": 0.08717380831309861, "kl": 0.971875, "learning_rate": 3.5765314914384024e-07, "loss": 0.0034, "num_tokens": 6436607.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 235.22858276367188, "epoch": 0.625, "grad_norm": 0.06146613518625495, "kl": 0.99765625, "learning_rate": 3.535886143617248e-07, "loss": 0.0034, "num_tokens": 6462760.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 238.17144165039062, "epoch": 0.6275, "grad_norm": 3.705436703551293, "kl": 1.01953125, "learning_rate": 3.495346348299724e-07, "loss": 0.0025, "num_tokens": 6488563.0, "reward": 2.9139774799346925, "reward_std": 0.1073996058665216, "rewards/classifier_reward": 0.9996915578842163, "rewards/length_reward": 0.9142857193946838, "rewards/slop_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 240.9714385986328, "epoch": 0.63, "grad_norm": 0.04691998567992384, "kl": 0.97890625, "learning_rate": 3.454915028125263e-07, "loss": 0.0034, "num_tokens": 6514734.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 228.02857971191406, "epoch": 0.6325, "grad_norm": 2.988060627858761, "kl": 1.0296875, "learning_rate": 3.4145950979129914e-07, "loss": 0.0029, "num_tokens": 6540498.0, "reward": 2.99902081489563, "reward_std": 0.0025911811739206315, "rewards/classifier_reward": 0.9990206360816956, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 218.54286499023436, "epoch": 0.635, "grad_norm": 0.09098624238326748, "kl": 1.11640625, "learning_rate": 3.3743894644515824e-07, "loss": 0.0035, "num_tokens": 6565616.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 223.57143859863282, "epoch": 0.6375, "grad_norm": 0.07923298438659351, "kl": 0.95703125, "learning_rate": 3.334301026289712e-07, "loss": 0.0033, "num_tokens": 6591361.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 219.00000610351563, "epoch": 0.64, "grad_norm": 3.44595488232409, "kl": 1.071875, "learning_rate": 3.294332673527076e-07, "loss": 0.003, "num_tokens": 6616850.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 220.80000915527344, "epoch": 0.6425, "grad_norm": 0.0704184343563711, "kl": 1.046875, "learning_rate": 3.254487287606044e-07, "loss": 0.0034, "num_tokens": 6642498.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 235.11429748535156, "epoch": 0.645, "grad_norm": 4.4341498316331425, "kl": 0.9890625, "learning_rate": 3.214767741103923e-07, "loss": 0.0024, "num_tokens": 6668511.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 227.11429443359376, "epoch": 0.6475, "grad_norm": 2.667970216501767, "kl": 1.121875, "learning_rate": 3.1751768975258743e-07, "loss": 0.003, "num_tokens": 6694380.0, "reward": 2.9997310638427734, "reward_std": 0.0007120789494365453, "rewards/classifier_reward": 0.9997308611869812, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 224.42858276367187, "epoch": 0.65, "grad_norm": 0.049266434577523735, "kl": 1.04921875, "learning_rate": 3.135717611098457e-07, "loss": 0.0034, "num_tokens": 6719910.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 220.97144165039063, "epoch": 0.6525, "grad_norm": 0.046460428834944396, "kl": 1.00546875, "learning_rate": 3.0963927265638734e-07, "loss": 0.0034, "num_tokens": 6745328.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 229.2571533203125, "epoch": 0.655, "grad_norm": 2.6813194889359324, "kl": 1.08125, "learning_rate": 3.0572050789748726e-07, "loss": 0.003, "num_tokens": 6771231.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 217.20001220703125, "epoch": 0.6575, "grad_norm": 2.8890479947848235, "kl": 1.16796875, "learning_rate": 3.018157493490374e-07, "loss": 0.0031, "num_tokens": 6796753.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 233.51430053710936, "epoch": 0.66, "grad_norm": 2.802128759307218, "kl": 1.371875, "learning_rate": 2.9792527851717803e-07, "loss": 0.0033, "num_tokens": 6822476.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 225.82858581542968, "epoch": 0.6625, "grad_norm": 0.06018626969663984, "kl": 1.04296875, "learning_rate": 2.940493758780037e-07, "loss": 0.0034, "num_tokens": 6847958.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 214.571435546875, "epoch": 0.665, "grad_norm": 57.99972496539001, "kl": 2.54609375, "learning_rate": 2.9018832085734295e-07, "loss": 0.0045, "num_tokens": 6873054.0, "reward": 2.8571430683135985, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8571428596973419, "rewards/slop_reward": 1.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 222.40001220703124, "epoch": 0.6675, "grad_norm": 5.391908835054465, "kl": 1.19921875, "learning_rate": 2.863423918106138e-07, "loss": 0.0022, "num_tokens": 6898757.0, "reward": 2.9923308849334718, "reward_std": 0.02029096046462655, "rewards/classifier_reward": 0.9923307299613953, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 226.80001220703124, "epoch": 0.67, "grad_norm": 0.44063981643559547, "kl": 1.27265625, "learning_rate": 2.825118660027553e-07, "loss": 0.0037, "num_tokens": 6924550.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 228.80001220703124, "epoch": 0.6725, "grad_norm": 3.710195212366325, "kl": 1.1890625, "learning_rate": 2.786970195882398e-07, "loss": 0.0026, "num_tokens": 6950478.0, "reward": 2.9779660224914553, "reward_std": 0.058296956680715085, "rewards/classifier_reward": 0.9993943929672241, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 227.51430053710936, "epoch": 0.675, "grad_norm": 99.1686641289829, "kl": 79.6328125, "learning_rate": 2.748981275911633e-07, "loss": 0.0819, "num_tokens": 6976266.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 242.82857971191407, "epoch": 0.6775, "grad_norm": 4.778004765539873, "kl": 1.34140625, "learning_rate": 2.7111546388541896e-07, "loss": 0.0028, "num_tokens": 7002514.0, "reward": 2.9389439105987547, "reward_std": 0.10570754185318947, "rewards/classifier_reward": 0.996086585521698, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 255.74286499023438, "epoch": 0.68, "grad_norm": 6.749318308021253, "kl": 1.54609375, "learning_rate": 2.673493011749513e-07, "loss": 0.003, "num_tokens": 7029221.0, "reward": 2.6523685693740844, "reward_std": 0.242851722240448, "rewards/classifier_reward": 0.9095112562179566, "rewards/length_reward": 0.7428571462631226, "rewards/slop_reward": 1.0, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 238.02857971191406, "epoch": 0.6825, "grad_norm": 4.816456021644019, "kl": 1.69375, "learning_rate": 2.635999109740976e-07, "loss": 0.0027, "num_tokens": 7055228.0, "reward": 2.858464765548706, "reward_std": 0.17900042831897736, "rewards/classifier_reward": 0.9941788673400879, "rewards/length_reward": 0.8857142865657807, "rewards/slop_reward": 0.9785714268684387, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 241.7714385986328, "epoch": 0.685, "grad_norm": 2.4842492940641647, "kl": 1.0171875, "learning_rate": 2.598675635880129e-07, "loss": 0.0029, "num_tokens": 7081610.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 228.571435546875, "epoch": 0.6875, "grad_norm": 2.556011928350466, "kl": 1.15546875, "learning_rate": 2.561525280931828e-07, "loss": 0.0031, "num_tokens": 7107408.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 231.6285827636719, "epoch": 0.69, "grad_norm": 0.08355624448786589, "kl": 1.053125, "learning_rate": 2.5245507231802486e-07, "loss": 0.0034, "num_tokens": 7133271.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 235.5428680419922, "epoch": 0.6925, "grad_norm": 3.2025322276348107, "kl": 1.4359375, "learning_rate": 2.487754628235805e-07, "loss": 0.0033, "num_tokens": 7159353.0, "reward": 2.9962107658386232, "reward_std": 0.01002594456076622, "rewards/classifier_reward": 0.9962105512619018, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 228.65715942382812, "epoch": 0.695, "grad_norm": 0.18581262102933002, "kl": 1.25859375, "learning_rate": 2.4511396488429724e-07, "loss": 0.0036, "num_tokens": 7185072.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 229.51429748535156, "epoch": 0.6975, "grad_norm": 4.453521381818444, "kl": 0.99765625, "learning_rate": 2.414708424689048e-07, "loss": 0.0024, "num_tokens": 7210683.0, "reward": 2.882569408416748, "reward_std": 0.2035010576248169, "rewards/classifier_reward": 0.9397120952606202, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 223.31429443359374, "epoch": 0.7, "grad_norm": 0.1121570381593698, "kl": 1.2296875, "learning_rate": 2.378463582213842e-07, "loss": 0.0036, "num_tokens": 7236399.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 217.60000915527343, "epoch": 0.7025, "grad_norm": 4.495034027794915, "kl": 0.9515625, "learning_rate": 2.3424077344203307e-07, "loss": 0.0024, "num_tokens": 7261935.0, "reward": 2.99290018081665, "reward_std": 0.018784815073013307, "rewards/classifier_reward": 0.9929000020027161, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 228.11429748535156, "epoch": 0.705, "grad_norm": 0.5186142582429601, "kl": 1.41875, "learning_rate": 2.3065434806862805e-07, "loss": 0.0038, "num_tokens": 7287768.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 228.11429443359376, "epoch": 0.7075, "grad_norm": 2.654334342756894, "kl": 1.0203125, "learning_rate": 2.2708734065768486e-07, "loss": 0.0029, "num_tokens": 7312659.0, "reward": 2.9993388175964357, "reward_std": 0.0017499331384897231, "rewards/classifier_reward": 0.9993385910987854, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 234.40001220703124, "epoch": 0.71, "grad_norm": 4.588949960274673, "kl": 1.12421875, "learning_rate": 2.2354000836581831e-07, "loss": 0.0021, "num_tokens": 7338617.0, "reward": 2.9408255100250242, "reward_std": 0.1565615115687251, "rewards/classifier_reward": 0.9908253073692321, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 231.6285827636719, "epoch": 0.7125, "grad_norm": 0.21793328593853228, "kl": 1.21484375, "learning_rate": 2.2001260693120232e-07, "loss": 0.0036, "num_tokens": 7364198.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 240.08572998046876, "epoch": 0.715, "grad_norm": 2.616590063333536, "kl": 1.01640625, "learning_rate": 2.1650539065513412e-07, "loss": 0.0029, "num_tokens": 7390479.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 241.97144165039063, "epoch": 0.7175, "grad_norm": 4.531348675554103, "kl": 1.19375, "learning_rate": 2.1301861238370016e-07, "loss": 0.0031, "num_tokens": 7416732.0, "reward": 2.942857360839844, "reward_std": 0.09759000539779664, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9428571462631226, "rewards/slop_reward": 1.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 246.20001220703125, "epoch": 0.72, "grad_norm": 4.718898706696491, "kl": 1.49921875, "learning_rate": 2.0955252348954805e-07, "loss": 0.0034, "num_tokens": 7443268.0, "reward": 2.6439733505249023, "reward_std": 0.043535226583480836, "rewards/classifier_reward": 0.8439731419086456, "rewards/length_reward": 0.8, "rewards/slop_reward": 1.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 224.48572387695313, "epoch": 0.7225, "grad_norm": 0.04294906761504897, "kl": 1.00546875, "learning_rate": 2.0610737385376348e-07, "loss": 0.0034, "num_tokens": 7468990.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 235.77144470214844, "epoch": 0.725, "grad_norm": 0.13090449233770926, "kl": 1.19453125, "learning_rate": 2.026834118478567e-07, "loss": 0.0036, "num_tokens": 7495162.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 222.57143859863282, "epoch": 0.7275, "grad_norm": 0.04429936575325668, "kl": 0.97734375, "learning_rate": 1.9928088431585589e-07, "loss": 0.0034, "num_tokens": 7520868.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 238.7428741455078, "epoch": 0.73, "grad_norm": 12.126930606187928, "kl": 10.5515625, "learning_rate": 1.959000365565119e-07, "loss": 0.0129, "num_tokens": 7546905.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 231.20001220703125, "epoch": 0.7325, "grad_norm": 1.1227825049824671, "kl": 1.8796875, "learning_rate": 1.925411123056128e-07, "loss": 0.0043, "num_tokens": 7572334.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 234.68572387695312, "epoch": 0.735, "grad_norm": 2.907894335014808, "kl": 1.3328125, "learning_rate": 1.8920435371841392e-07, "loss": 0.0032, "num_tokens": 7598444.0, "reward": 2.996587371826172, "reward_std": 0.009029625356197358, "rewards/classifier_reward": 0.9965871214866638, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 232.68572387695312, "epoch": 0.7375, "grad_norm": 0.10333954670768886, "kl": 1.1203125, "learning_rate": 1.858900013521788e-07, "loss": 0.0035, "num_tokens": 7624449.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 233.54286499023436, "epoch": 0.74, "grad_norm": 0.058548069484880644, "kl": 1.05859375, "learning_rate": 1.8259829414883725e-07, "loss": 0.0034, "num_tokens": 7650523.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 235.9428680419922, "epoch": 0.7425, "grad_norm": 3.1888591778113993, "kl": 1.009375, "learning_rate": 1.7932946941775878e-07, "loss": 0.0029, "num_tokens": 7676533.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 235.5428680419922, "epoch": 0.745, "grad_norm": 0.10062491596572344, "kl": 1.0046875, "learning_rate": 1.7608376281864502e-07, "loss": 0.0034, "num_tokens": 7702619.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 244.51430053710936, "epoch": 0.7475, "grad_norm": 0.061386216937859915, "kl": 1.02578125, "learning_rate": 1.7286140834453954e-07, "loss": 0.0034, "num_tokens": 7729097.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 254.4571533203125, "epoch": 0.75, "grad_norm": 0.05751560507113229, "kl": 1.03984375, "learning_rate": 1.6966263830495935e-07, "loss": 0.0034, "num_tokens": 7755641.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 240.34287109375, "epoch": 0.7525, "grad_norm": 0.11118968278237154, "kl": 1.11484375, "learning_rate": 1.6648768330914576e-07, "loss": 0.0035, "num_tokens": 7781895.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 241.40000915527344, "epoch": 0.755, "grad_norm": 0.191477055644317, "kl": 1.0296875, "learning_rate": 1.6333677224944037e-07, "loss": 0.0034, "num_tokens": 7808096.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 256.8571533203125, "epoch": 0.7575, "grad_norm": 2.9503904549206177, "kl": 0.984375, "learning_rate": 1.6021013228478275e-07, "loss": 0.0029, "num_tokens": 7835006.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 246.02857971191406, "epoch": 0.76, "grad_norm": 7.0345652170290975, "kl": 1.221875, "learning_rate": 1.5710798882433428e-07, "loss": 0.0036, "num_tokens": 7861536.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 253.68572387695312, "epoch": 0.7625, "grad_norm": 0.0476505587038927, "kl": 0.96328125, "learning_rate": 1.5403056551122694e-07, "loss": 0.0033, "num_tokens": 7888255.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 247.88572387695314, "epoch": 0.765, "grad_norm": 0.04204135010045963, "kl": 0.89609375, "learning_rate": 1.5097808420644115e-07, "loss": 0.0033, "num_tokens": 7914639.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 255.1428649902344, "epoch": 0.7675, "grad_norm": 0.07577358460759685, "kl": 0.9578125, "learning_rate": 1.479507649728105e-07, "loss": 0.0033, "num_tokens": 7941362.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 249.9714385986328, "epoch": 0.77, "grad_norm": 0.04375804621751843, "kl": 0.95703125, "learning_rate": 1.4494882605915714e-07, "loss": 0.0033, "num_tokens": 7967870.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 262.4571533203125, "epoch": 0.7725, "grad_norm": 3.881559830490846, "kl": 1.06953125, "learning_rate": 1.419724838845569e-07, "loss": 0.0025, "num_tokens": 7994976.0, "reward": 2.7243717670440675, "reward_std": 0.2547113478183746, "rewards/classifier_reward": 0.9529429793357849, "rewards/length_reward": 0.7714285761117935, "rewards/slop_reward": 1.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 256.0857208251953, "epoch": 0.775, "grad_norm": 0.042854049785843215, "kl": 0.934375, "learning_rate": 1.3902195302273778e-07, "loss": 0.0033, "num_tokens": 8021851.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 270.20001525878905, "epoch": 0.7775, "grad_norm": 5.120776391757703, "kl": 0.9703125, "learning_rate": 1.3609744618661013e-07, "loss": 0.0019, "num_tokens": 8049101.0, "reward": 2.8000001430511476, "reward_std": 0.28008740544319155, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8000000059604645, "rewards/slop_reward": 1.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 266.88572998046874, "epoch": 0.78, "grad_norm": 5.825629703539792, "kl": 1.01640625, "learning_rate": 1.331991742129318e-07, "loss": 0.0024, "num_tokens": 8076201.0, "reward": 2.828571605682373, "reward_std": 0.1731828987598419, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8285714328289032, "rewards/slop_reward": 1.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 255.80001220703124, "epoch": 0.7825, "grad_norm": 2.2750579624967204, "kl": 0.90625, "learning_rate": 1.3032734604710783e-07, "loss": 0.0028, "num_tokens": 8102845.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 258.34287109375, "epoch": 0.785, "grad_norm": 2.9657377944921426, "kl": 1.23046875, "learning_rate": 1.2748216872812745e-07, "loss": 0.0031, "num_tokens": 8129806.0, "reward": 2.828571653366089, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8285714298486709, "rewards/slop_reward": 1.0, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 241.60000915527343, "epoch": 0.7875, "grad_norm": 0.04358713190283898, "kl": 0.9265625, "learning_rate": 1.2466384737363779e-07, "loss": 0.0033, "num_tokens": 8156161.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 252.94286499023437, "epoch": 0.79, "grad_norm": 0.05992737661065536, "kl": 0.98046875, "learning_rate": 1.2187258516515642e-07, "loss": 0.0034, "num_tokens": 8182699.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 247.91429443359374, "epoch": 0.7925, "grad_norm": 0.07122711325744925, "kl": 0.9984375, "learning_rate": 1.1910858333342277e-07, "loss": 0.0034, "num_tokens": 8209296.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 250.11429748535156, "epoch": 0.795, "grad_norm": 3.4982043756301584, "kl": 0.98671875, "learning_rate": 1.1637204114389177e-07, "loss": 0.0029, "num_tokens": 8235818.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 251.08572387695312, "epoch": 0.7975, "grad_norm": 0.04511891765892185, "kl": 0.93671875, "learning_rate": 1.1366315588236741e-07, "loss": 0.0033, "num_tokens": 8262480.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 252.4571563720703, "epoch": 0.8, "grad_norm": 4.101920254520037, "kl": 0.98984375, "learning_rate": 1.1098212284078035e-07, "loss": 0.0024, "num_tokens": 8289236.0, "reward": 2.8857144832611086, "reward_std": 0.19518001079559327, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.8857142925262451, "rewards/slop_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 248.91429748535157, "epoch": 0.8025, "grad_norm": 0.05706184089424821, "kl": 0.975, "learning_rate": 1.0832913530310783e-07, "loss": 0.0034, "num_tokens": 8315716.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 250.31430358886718, "epoch": 0.805, "grad_norm": 0.07917877336969092, "kl": 1.02265625, "learning_rate": 1.0570438453144043e-07, "loss": 0.0034, "num_tokens": 8342093.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 246.60001220703126, "epoch": 0.8075, "grad_norm": 0.09776954891266137, "kl": 1.1421875, "learning_rate": 1.0310805975219255e-07, "loss": 0.0035, "num_tokens": 8368479.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 241.85715026855468, "epoch": 0.81, "grad_norm": 0.05012984603712916, "kl": 0.96015625, "learning_rate": 1.0054034814246093e-07, "loss": 0.0033, "num_tokens": 8394862.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 244.7714385986328, "epoch": 0.8125, "grad_norm": 0.2165819917911517, "kl": 1.35078125, "learning_rate": 9.800143481652979e-08, "loss": 0.0037, "num_tokens": 8421276.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 246.08572998046876, "epoch": 0.815, "grad_norm": 0.20309822089295643, "kl": 1.27265625, "learning_rate": 9.549150281252632e-08, "loss": 0.0037, "num_tokens": 8447767.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 243.6571533203125, "epoch": 0.8175, "grad_norm": 0.15578036977279336, "kl": 1.1515625, "learning_rate": 9.30107330792243e-08, "loss": 0.0035, "num_tokens": 8474150.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 237.4857208251953, "epoch": 0.82, "grad_norm": 0.043416576178703016, "kl": 0.9265625, "learning_rate": 9.055930446299914e-08, "loss": 0.0033, "num_tokens": 8500171.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 234.57143859863282, "epoch": 0.8225, "grad_norm": 0.05061080629519643, "kl": 1.034375, "learning_rate": 8.813739369493395e-08, "loss": 0.0034, "num_tokens": 8526178.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 243.28572692871094, "epoch": 0.825, "grad_norm": 0.04111158267935015, "kl": 0.94296875, "learning_rate": 8.574517537807896e-08, "loss": 0.0033, "num_tokens": 8552519.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 242.00000915527343, "epoch": 0.8275, "grad_norm": 0.3393008711951354, "kl": 1.38515625, "learning_rate": 8.338282197486362e-08, "loss": 0.0038, "num_tokens": 8578538.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 249.71430053710938, "epoch": 0.83, "grad_norm": 5.920098814946804, "kl": 1.471875, "learning_rate": 8.105050379466332e-08, "loss": 0.0034, "num_tokens": 8604935.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 249.34286499023438, "epoch": 0.8325, "grad_norm": 2.5733896209901923, "kl": 1.1671875, "learning_rate": 7.87483889815207e-08, "loss": 0.0031, "num_tokens": 8631504.0, "reward": 2.991787624359131, "reward_std": 0.02172858864068985, "rewards/classifier_reward": 0.9917873620986939, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 237.14287109375, "epoch": 0.835, "grad_norm": 2.715010668750814, "kl": 0.91875, "learning_rate": 7.64766435020246e-08, "loss": 0.0028, "num_tokens": 8657724.0, "reward": 2.989917850494385, "reward_std": 0.026675373315811157, "rewards/classifier_reward": 0.9899176597595215, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 247.05715637207032, "epoch": 0.8375, "grad_norm": 2.9706039903092303, "kl": 0.96171875, "learning_rate": 7.423543113334435e-08, "loss": 0.0029, "num_tokens": 8684291.0, "reward": 2.998081636428833, "reward_std": 0.005076154321432114, "rewards/classifier_reward": 0.9980813980102539, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 251.02857971191406, "epoch": 0.84, "grad_norm": 3.791827437250973, "kl": 0.978125, "learning_rate": 7.202491345142286e-08, "loss": 0.0029, "num_tokens": 8710997.0, "reward": 2.6675583362579345, "reward_std": 0.1484653353691101, "rewards/classifier_reward": 0.8389866888523102, "rewards/length_reward": 0.8285714298486709, "rewards/slop_reward": 1.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 241.48572692871093, "epoch": 0.8425, "grad_norm": 0.26642623759823414, "kl": 1.29609375, "learning_rate": 6.984524981932755e-08, "loss": 0.0037, "num_tokens": 8736817.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 247.2571533203125, "epoch": 0.845, "grad_norm": 0.05826256029366563, "kl": 0.9796875, "learning_rate": 6.769659737576227e-08, "loss": 0.0034, "num_tokens": 8763338.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 247.00001525878906, "epoch": 0.8475, "grad_norm": 0.0820068561247779, "kl": 0.91484375, "learning_rate": 6.557911102373809e-08, "loss": 0.0033, "num_tokens": 8789832.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 241.57143859863282, "epoch": 0.85, "grad_norm": 0.145851308869347, "kl": 1.04375, "learning_rate": 6.349294341940592e-08, "loss": 0.0034, "num_tokens": 8816056.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 249.6571533203125, "epoch": 0.8525, "grad_norm": 4.893160591979278, "kl": 1.1859375, "learning_rate": 6.143824496105121e-08, "loss": 0.0031, "num_tokens": 8842714.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 245.80001220703124, "epoch": 0.855, "grad_norm": 0.06508578960217941, "kl": 1.04375, "learning_rate": 5.941516377825101e-08, "loss": 0.0034, "num_tokens": 8869237.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 249.80000915527344, "epoch": 0.8575, "grad_norm": 4.184670329516298, "kl": 0.9828125, "learning_rate": 5.7423845721195184e-08, "loss": 0.0024, "num_tokens": 8895106.0, "reward": 2.950000190734863, "reward_std": 0.13228756189346313, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 0.9785714268684387, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 247.22858276367188, "epoch": 0.86, "grad_norm": 0.04621548010483809, "kl": 0.96796875, "learning_rate": 5.546443435017145e-08, "loss": 0.0034, "num_tokens": 8921496.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 243.71429443359375, "epoch": 0.8625, "grad_norm": 3.0589783495407725, "kl": 0.93515625, "learning_rate": 5.353707092521581e-08, "loss": 0.0028, "num_tokens": 8947905.0, "reward": 2.994158411026001, "reward_std": 0.01545594185590744, "rewards/classifier_reward": 0.9941581964492798, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 244.94286499023437, "epoch": 0.865, "grad_norm": 5.554678417760662, "kl": 3.01484375, "learning_rate": 5.16418943959282e-08, "loss": 0.0054, "num_tokens": 8974370.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 244.7714416503906, "epoch": 0.8675, "grad_norm": 0.0645376640043755, "kl": 0.915625, "learning_rate": 4.9779041391455775e-08, "loss": 0.0033, "num_tokens": 9000853.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 245.8571563720703, "epoch": 0.87, "grad_norm": 0.1578429426456495, "kl": 1.0453125, "learning_rate": 4.794864621064265e-08, "loss": 0.0034, "num_tokens": 9027326.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 243.51429748535156, "epoch": 0.8725, "grad_norm": 0.09635841303768418, "kl": 1.05234375, "learning_rate": 4.615084081234799e-08, "loss": 0.0034, "num_tokens": 9053438.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 238.80000915527344, "epoch": 0.875, "grad_norm": 9.025341176830713, "kl": 0.93828125, "learning_rate": 4.4385754805932095e-08, "loss": 0.0024, "num_tokens": 9079245.0, "reward": 2.9694100856781005, "reward_std": 0.0809337928891182, "rewards/classifier_reward": 0.9908384680747986, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 249.28572692871094, "epoch": 0.8775, "grad_norm": 3.7703349335107053, "kl": 1.09296875, "learning_rate": 4.2653515441913646e-08, "loss": 0.003, "num_tokens": 9105890.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 229.60000915527343, "epoch": 0.88, "grad_norm": 4.44121260340152, "kl": 0.9359375, "learning_rate": 4.095424760279453e-08, "loss": 0.0024, "num_tokens": 9131259.0, "reward": 2.8642487049102785, "reward_std": 0.13740314245224, "rewards/classifier_reward": 0.8642484605312347, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 238.171435546875, "epoch": 0.8825, "grad_norm": 4.315109786774216, "kl": 1.1671875, "learning_rate": 3.928807379405763e-08, "loss": 0.0026, "num_tokens": 9157515.0, "reward": 2.804906415939331, "reward_std": 0.016342369094491004, "rewards/classifier_reward": 0.8049062207341194, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 239.2571563720703, "epoch": 0.885, "grad_norm": 2.5953487008425773, "kl": 0.98125, "learning_rate": 3.7655114135334284e-08, "loss": 0.0029, "num_tokens": 9183753.0, "reward": 2.9993186473846434, "reward_std": 0.0018032947555184364, "rewards/classifier_reward": 0.9993184208869934, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 243.91429748535157, "epoch": 0.8875, "grad_norm": 3.061039239899815, "kl": 0.87109375, "learning_rate": 3.6055486351745324e-08, "loss": 0.0028, "num_tokens": 9210003.0, "reward": 2.9983937740325928, "reward_std": 0.004250280186533928, "rewards/classifier_reward": 0.9983935475349426, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 248.65715637207032, "epoch": 0.89, "grad_norm": 0.07280216811639449, "kl": 1.0234375, "learning_rate": 3.448930576541309e-08, "loss": 0.0034, "num_tokens": 9236471.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 244.571435546875, "epoch": 0.8925, "grad_norm": 0.04031184293990925, "kl": 0.85625, "learning_rate": 3.295668528714801e-08, "loss": 0.0032, "num_tokens": 9262896.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 247.0571533203125, "epoch": 0.895, "grad_norm": 0.048980473886171605, "kl": 0.90390625, "learning_rate": 3.145773540830815e-08, "loss": 0.0033, "num_tokens": 9289145.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 247.48572692871093, "epoch": 0.8975, "grad_norm": 0.06968257994497579, "kl": 0.9921875, "learning_rate": 2.9992564192834246e-08, "loss": 0.0034, "num_tokens": 9315727.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 245.34286804199218, "epoch": 0.9, "grad_norm": 0.05048579158042856, "kl": 0.965625, "learning_rate": 2.8561277269457895e-08, "loss": 0.0034, "num_tokens": 9342091.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 240.94287109375, "epoch": 0.9025, "grad_norm": 0.07338683112545501, "kl": 0.9578125, "learning_rate": 2.7163977824087692e-08, "loss": 0.0033, "num_tokens": 9368444.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 245.48572387695313, "epoch": 0.905, "grad_norm": 2.0759195339474985, "kl": 2.634375, "learning_rate": 2.5800766592369073e-08, "loss": 0.005, "num_tokens": 9394956.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 243.1428649902344, "epoch": 0.9075, "grad_norm": 0.071979853158223, "kl": 0.9921875, "learning_rate": 2.4471741852423233e-08, "loss": 0.0034, "num_tokens": 9421111.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 247.60001525878906, "epoch": 0.91, "grad_norm": 4.112340756036376, "kl": 1.06640625, "learning_rate": 2.3176999417760633e-08, "loss": 0.003, "num_tokens": 9447147.0, "reward": 2.997753620147705, "reward_std": 0.005943871289491654, "rewards/classifier_reward": 0.9977534294128418, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 247.65715637207032, "epoch": 0.9125, "grad_norm": 0.1514075169088936, "kl": 1.1046875, "learning_rate": 2.1916632630374577e-08, "loss": 0.0035, "num_tokens": 9473085.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 240.74286804199218, "epoch": 0.915, "grad_norm": 0.18737733575482768, "kl": 1.07578125, "learning_rate": 2.0690732354011088e-08, "loss": 0.0035, "num_tokens": 9499431.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 235.6571533203125, "epoch": 0.9175, "grad_norm": 0.04304702113082608, "kl": 0.92421875, "learning_rate": 1.9499386967619104e-08, "loss": 0.0033, "num_tokens": 9525433.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 234.6571502685547, "epoch": 0.92, "grad_norm": 2.545151028403328, "kl": 0.9734375, "learning_rate": 1.8342682358978068e-08, "loss": 0.0029, "num_tokens": 9551354.0, "reward": 2.9982373237609865, "reward_std": 0.004664153978228569, "rewards/classifier_reward": 0.9982371211051941, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 244.11429443359376, "epoch": 0.9225, "grad_norm": 0.04345300022978855, "kl": 0.9125, "learning_rate": 1.7220701918506662e-08, "loss": 0.0033, "num_tokens": 9577818.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 247.74287109375, "epoch": 0.925, "grad_norm": 3.5704547668159674, "kl": 1.284375, "learning_rate": 1.6133526533250563e-08, "loss": 0.0032, "num_tokens": 9604325.0, "reward": 2.9714287757873534, "reward_std": 0.07559289336204529, "rewards/classifier_reward": 1.0, "rewards/length_reward": 0.9714285731315613, "rewards/slop_reward": 1.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 237.11429748535156, "epoch": 0.9275, "grad_norm": 0.041173535714356384, "kl": 0.88828125, "learning_rate": 1.5081234581051482e-08, "loss": 0.0033, "num_tokens": 9630168.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 244.8571533203125, "epoch": 0.93, "grad_norm": 0.08243253183651265, "kl": 0.934375, "learning_rate": 1.4063901924895982e-08, "loss": 0.0033, "num_tokens": 9656156.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 237.1428649902344, "epoch": 0.9325, "grad_norm": 0.05082406017565479, "kl": 0.94921875, "learning_rate": 1.3081601907447004e-08, "loss": 0.0033, "num_tokens": 9682368.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 242.80001220703124, "epoch": 0.935, "grad_norm": 2.8407093679191733, "kl": 1.04453125, "learning_rate": 1.2134405345755772e-08, "loss": 0.003, "num_tokens": 9708489.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 248.08572692871093, "epoch": 0.9375, "grad_norm": 0.14511834175077776, "kl": 1.07421875, "learning_rate": 1.1222380526156927e-08, "loss": 0.0035, "num_tokens": 9734849.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 241.48572692871093, "epoch": 0.94, "grad_norm": 0.042968063146662946, "kl": 0.91875, "learning_rate": 1.034559319934497e-08, "loss": 0.0033, "num_tokens": 9761221.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 235.17144165039062, "epoch": 0.9425, "grad_norm": 3.168647847961566, "kl": 1.0890625, "learning_rate": 9.504106575634663e-09, "loss": 0.003, "num_tokens": 9787372.0, "reward": 2.965010404586792, "reward_std": 0.06012881994247436, "rewards/classifier_reward": 0.9864387512207031, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 237.91429748535157, "epoch": 0.945, "grad_norm": 0.04526250685823323, "kl": 0.99375, "learning_rate": 8.697981320403336e-09, "loss": 0.0034, "num_tokens": 9813619.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 246.20001220703125, "epoch": 0.9475, "grad_norm": 4.273461002843142, "kl": 0.9484375, "learning_rate": 7.927275549718226e-09, "loss": 0.0019, "num_tokens": 9839751.0, "reward": 2.9877804279327393, "reward_std": 0.03233038559556008, "rewards/classifier_reward": 0.9877802491188049, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 247.8571533203125, "epoch": 0.95, "grad_norm": 0.20002216578085852, "kl": 1.165625, "learning_rate": 7.1920448261457715e-09, "loss": 0.0035, "num_tokens": 9866244.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 244.60000915527343, "epoch": 0.9525, "grad_norm": 0.0834845425154776, "kl": 1.04375, "learning_rate": 6.492342154746588e-09, "loss": 0.0034, "num_tokens": 9892505.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 237.31429748535157, "epoch": 0.955, "grad_norm": 2.52753839477545, "kl": 1.11171875, "learning_rate": 5.828217979253869e-09, "loss": 0.003, "num_tokens": 9918307.0, "reward": 2.9943798542022706, "reward_std": 0.014869998395442962, "rewards/classifier_reward": 0.9943796753883362, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 235.571435546875, "epoch": 0.9575, "grad_norm": 3.128240693093393, "kl": 0.98359375, "learning_rate": 5.1997201784368395e-09, "loss": 0.0029, "num_tokens": 9944374.0, "reward": 2.995487594604492, "reward_std": 0.011939284205436707, "rewards/classifier_reward": 0.995487380027771, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 240.9428680419922, "epoch": 0.96, "grad_norm": 3.5538024950839295, "kl": 1.0, "learning_rate": 4.606894062648969e-09, "loss": 0.0024, "num_tokens": 9970727.0, "reward": 2.977288818359375, "reward_std": 0.06008877456188202, "rewards/classifier_reward": 0.9987171411514282, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 240.6571502685547, "epoch": 0.9625, "grad_norm": 0.1991534915494115, "kl": 1.184375, "learning_rate": 4.049782370561583e-09, "loss": 0.0036, "num_tokens": 9997070.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 235.34286804199218, "epoch": 0.965, "grad_norm": 0.05753950305238813, "kl": 0.99375, "learning_rate": 3.5284252660823244e-09, "loss": 0.0034, "num_tokens": 10023227.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 237.6571533203125, "epoch": 0.9675, "grad_norm": 0.09059748000607612, "kl": 1.003125, "learning_rate": 3.0428603354600844e-09, "loss": 0.0034, "num_tokens": 10049465.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 254.54286499023436, "epoch": 0.97, "grad_norm": 0.12475756025548783, "kl": 1.07890625, "learning_rate": 2.5931225845748917e-09, "loss": 0.0035, "num_tokens": 10076294.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 242.6571533203125, "epoch": 0.9725, "grad_norm": 0.4783333308376495, "kl": 1.3875, "learning_rate": 2.1792444364144847e-09, "loss": 0.0038, "num_tokens": 10102419.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 241.34286804199218, "epoch": 0.975, "grad_norm": 0.08716408529439204, "kl": 0.93125, "learning_rate": 1.8012557287367391e-09, "loss": 0.0033, "num_tokens": 10128786.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 241.00000915527343, "epoch": 0.9775, "grad_norm": 3.1645439252715826, "kl": 0.9140625, "learning_rate": 1.4591837119186102e-09, "loss": 0.0028, "num_tokens": 10155141.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 235.28572692871094, "epoch": 0.98, "grad_norm": 0.41518239333595675, "kl": 1.1234375, "learning_rate": 1.1530530469914256e-09, "loss": 0.0035, "num_tokens": 10181268.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 243.7714385986328, "epoch": 0.9825, "grad_norm": 0.040861855208901524, "kl": 0.878125, "learning_rate": 8.828858038632536e-10, "loss": 0.0033, "num_tokens": 10207349.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 249.9714385986328, "epoch": 0.985, "grad_norm": 0.12170276230850544, "kl": 1.1421875, "learning_rate": 6.48701459727563e-10, "loss": 0.0035, "num_tokens": 10233544.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 243.00001220703126, "epoch": 0.9875, "grad_norm": 0.04242721259201106, "kl": 0.93828125, "learning_rate": 4.5051689765929213e-10, "loss": 0.0033, "num_tokens": 10259969.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 248.68572387695312, "epoch": 0.99, "grad_norm": 2.430119389458536, "kl": 0.96875, "learning_rate": 2.883464053973772e-10, "loss": 0.0029, "num_tokens": 10286476.0, "reward": 2.9996359825134276, "reward_std": 0.0009637950919568538, "rewards/classifier_reward": 0.9996357202529907, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 244.34286499023438, "epoch": 0.9925, "grad_norm": 2.4946704769049948, "kl": 1.090625, "learning_rate": 1.6220167431502118e-10, "loss": 0.003, "num_tokens": 10312797.0, "reward": 2.998571014404297, "reward_std": 0.0037812769412994387, "rewards/classifier_reward": 0.9985708117485046, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 240.60001220703126, "epoch": 0.995, "grad_norm": 0.057929252615434516, "kl": 0.93359375, "learning_rate": 7.209179857675663e-11, "loss": 0.0033, "num_tokens": 10338970.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 238.6285827636719, "epoch": 0.9975, "grad_norm": 0.06529828900649211, "kl": 1.0125, "learning_rate": 1.8023274482636965e-11, "loss": 0.0034, "num_tokens": 10365242.0, "reward": 3.000000238418579, "reward_std": 0.0, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 1.0, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 239.8800018310547, "epoch": 1.0, "grad_norm": 2.484762868251253, "kl": 0.875, "learning_rate": 0.0, "loss": 0.0028, "num_tokens": 10384026.0, "reward": 2.978571653366089, "reward_std": 0.056694668531417844, "rewards/classifier_reward": 1.0, "rewards/length_reward": 1.0, "rewards/slop_reward": 0.9785714268684387, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }