|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1000.0000610351562, |
|
"epoch": 0.0025, |
|
"grad_norm": 0.2079561913831216, |
|
"kl": 0.0158782958984375, |
|
"learning_rate": 3.3333333333333334e-08, |
|
"loss": 0.0, |
|
"num_tokens": 52920.0, |
|
"reward": 2.0, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 179.22857971191405, |
|
"epoch": 0.005, |
|
"grad_norm": 12.249017739002824, |
|
"kl": 0.00041656494140625, |
|
"learning_rate": 6.666666666666667e-08, |
|
"loss": 0.0, |
|
"num_tokens": 77113.0, |
|
"reward": 1.8515485525131226, |
|
"reward_std": 0.6024735510349274, |
|
"rewards/classifier_reward": 0.37967347651720046, |
|
"rewards/length_reward": 0.7142857313156128, |
|
"rewards/slop_reward": 0.7575892865657806, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 133.4857208251953, |
|
"epoch": 0.0075, |
|
"grad_norm": 10089.085994752151, |
|
"kl": 0.001357269287109375, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"num_tokens": 99305.0, |
|
"reward": 1.5958443641662599, |
|
"reward_std": 0.44231254458427427, |
|
"rewards/classifier_reward": 0.4083442732691765, |
|
"rewards/length_reward": 0.3428571462631226, |
|
"rewards/slop_reward": 0.8446428537368774, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 215.08572387695312, |
|
"epoch": 0.01, |
|
"grad_norm": 2054.9417817778553, |
|
"kl": 0.0014644622802734374, |
|
"learning_rate": 1.3333333333333334e-07, |
|
"loss": 0.0, |
|
"num_tokens": 124753.0, |
|
"reward": 1.2784200072288514, |
|
"reward_std": 0.6290957629680634, |
|
"rewards/classifier_reward": 0.1494020951911807, |
|
"rewards/length_reward": 0.4571428656578064, |
|
"rewards/slop_reward": 0.6718749940395355, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 173.74286346435548, |
|
"epoch": 0.0125, |
|
"grad_norm": 2266.5233619504957, |
|
"kl": 0.001641082763671875, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": 0.0, |
|
"num_tokens": 148708.0, |
|
"reward": 1.3791944861412049, |
|
"reward_std": 0.43466432094573976, |
|
"rewards/classifier_reward": 0.17383726984262465, |
|
"rewards/length_reward": 0.5428571462631225, |
|
"rewards/slop_reward": 0.6625, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.02857971191406, |
|
"epoch": 0.015, |
|
"grad_norm": 11.643315762932316, |
|
"kl": 0.0005245208740234375, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0, |
|
"num_tokens": 173615.0, |
|
"reward": 2.2288811445236205, |
|
"reward_std": 0.4513823240995407, |
|
"rewards/classifier_reward": 0.4342381663620472, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 0.9089285731315613, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 213.74286499023438, |
|
"epoch": 0.0175, |
|
"grad_norm": 13.650814377669256, |
|
"kl": 0.000527191162109375, |
|
"learning_rate": 2.3333333333333333e-07, |
|
"loss": 0.0, |
|
"num_tokens": 199016.0, |
|
"reward": 1.9626029968261718, |
|
"reward_std": 0.3459654450416565, |
|
"rewards/classifier_reward": 0.39831727296113967, |
|
"rewards/length_reward": 0.7142857164144516, |
|
"rewards/slop_reward": 0.8499999940395355, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 288.71429443359375, |
|
"epoch": 0.02, |
|
"grad_norm": 10.936857808142296, |
|
"kl": 0.0004474639892578125, |
|
"learning_rate": 2.6666666666666667e-07, |
|
"loss": 0.0, |
|
"num_tokens": 227041.0, |
|
"reward": 1.8008938789367677, |
|
"reward_std": 0.3342688336968422, |
|
"rewards/classifier_reward": 0.4794651668518782, |
|
"rewards/length_reward": 0.4285714328289032, |
|
"rewards/slop_reward": 0.8928571343421936, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 135.14286193847656, |
|
"epoch": 0.0225, |
|
"grad_norm": 13.869838227975858, |
|
"kl": 0.000525665283203125, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 249691.0, |
|
"reward": 1.9354748964309691, |
|
"reward_std": 0.3361890375614166, |
|
"rewards/classifier_reward": 0.5604747980833054, |
|
"rewards/length_reward": 0.5142857193946838, |
|
"rewards/slop_reward": 0.8607142925262451, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 208.91429748535157, |
|
"epoch": 0.025, |
|
"grad_norm": 12.652344276830467, |
|
"kl": 0.0005481719970703125, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.0, |
|
"num_tokens": 274843.0, |
|
"reward": 1.943293523788452, |
|
"reward_std": 0.6581979870796204, |
|
"rewards/classifier_reward": 0.5843648463487625, |
|
"rewards/length_reward": 0.6000000178813935, |
|
"rewards/slop_reward": 0.7589285731315613, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 205.71429443359375, |
|
"epoch": 0.0275, |
|
"grad_norm": 15.071895621435214, |
|
"kl": 0.00064544677734375, |
|
"learning_rate": 3.666666666666666e-07, |
|
"loss": 0.0, |
|
"num_tokens": 299820.0, |
|
"reward": 1.5763698101043702, |
|
"reward_std": 0.4757842034101486, |
|
"rewards/classifier_reward": 0.431726861000061, |
|
"rewards/length_reward": 0.25714286863803865, |
|
"rewards/slop_reward": 0.8875, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 230.4285858154297, |
|
"epoch": 0.03, |
|
"grad_norm": 190.50276677565853, |
|
"kl": 0.014757537841796875, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0, |
|
"num_tokens": 325730.0, |
|
"reward": 2.0366717338562013, |
|
"reward_std": 0.44329026341438293, |
|
"rewards/classifier_reward": 0.3331002712249756, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 0.8178571462631226, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 205.62857971191406, |
|
"epoch": 0.0325, |
|
"grad_norm": 12.86535673011762, |
|
"kl": 0.0007328033447265625, |
|
"learning_rate": 4.3333333333333335e-07, |
|
"loss": 0.0, |
|
"num_tokens": 350634.0, |
|
"reward": 1.3644737243652343, |
|
"reward_std": 0.5067215681076049, |
|
"rewards/classifier_reward": 0.29661653861403464, |
|
"rewards/length_reward": 0.31428571939468386, |
|
"rewards/slop_reward": 0.7535714268684387, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 149.25715026855468, |
|
"epoch": 0.035, |
|
"grad_norm": 13.332730561008187, |
|
"kl": 0.00087738037109375, |
|
"learning_rate": 4.6666666666666666e-07, |
|
"loss": 0.0, |
|
"num_tokens": 373261.0, |
|
"reward": 1.4552783489227294, |
|
"reward_std": 0.6117016971111298, |
|
"rewards/classifier_reward": 0.18384971991181373, |
|
"rewards/length_reward": 0.40000001192092893, |
|
"rewards/slop_reward": 0.8714285612106323, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 213.71429901123048, |
|
"epoch": 0.0375, |
|
"grad_norm": 12.15928533039066, |
|
"kl": 0.001068878173828125, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"num_tokens": 398319.0, |
|
"reward": 1.7259351372718812, |
|
"reward_std": 0.5805591940879822, |
|
"rewards/classifier_reward": 0.5187922030687332, |
|
"rewards/length_reward": 0.48571428954601287, |
|
"rewards/slop_reward": 0.7214285731315613, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 200.94286499023437, |
|
"epoch": 0.04, |
|
"grad_norm": 210.10906643069993, |
|
"kl": 0.006036376953125, |
|
"learning_rate": 5.333333333333333e-07, |
|
"loss": 0.0, |
|
"num_tokens": 423267.0, |
|
"reward": 1.5157490253448487, |
|
"reward_std": 0.4267912685871124, |
|
"rewards/classifier_reward": 0.32646322101354597, |
|
"rewards/length_reward": 0.37142857611179353, |
|
"rewards/slop_reward": 0.8178571403026581, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 226.17143859863282, |
|
"epoch": 0.0425, |
|
"grad_norm": 10.900066010271587, |
|
"kl": 0.0018096923828125, |
|
"learning_rate": 5.666666666666666e-07, |
|
"loss": 0.0, |
|
"num_tokens": 449061.0, |
|
"reward": 1.5792103052139281, |
|
"reward_std": 0.5069970846176147, |
|
"rewards/classifier_reward": 0.13992448002099991, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.7535714209079742, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 167.08572387695312, |
|
"epoch": 0.045, |
|
"grad_norm": 265.89705750792126, |
|
"kl": 0.0086395263671875, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0, |
|
"num_tokens": 472825.0, |
|
"reward": 1.8174922943115235, |
|
"reward_std": 0.39020195603370667, |
|
"rewards/classifier_reward": 0.3174922451376915, |
|
"rewards/length_reward": 0.6285714387893677, |
|
"rewards/slop_reward": 0.8714285731315613, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 150.85714721679688, |
|
"epoch": 0.0475, |
|
"grad_norm": 16.75575953327707, |
|
"kl": 0.0071929931640625, |
|
"learning_rate": 6.333333333333332e-07, |
|
"loss": 0.0, |
|
"num_tokens": 496025.0, |
|
"reward": 1.9033495664596558, |
|
"reward_std": 0.46038708090782166, |
|
"rewards/classifier_reward": 0.5676352053880691, |
|
"rewards/length_reward": 0.4000000059604645, |
|
"rewards/slop_reward": 0.9357142806053161, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 170.20000610351562, |
|
"epoch": 0.05, |
|
"grad_norm": 12.507998039364391, |
|
"kl": 0.008447265625, |
|
"learning_rate": 6.666666666666666e-07, |
|
"loss": 0.0, |
|
"num_tokens": 519775.0, |
|
"reward": 1.8573363780975343, |
|
"reward_std": 0.46103876233100893, |
|
"rewards/classifier_reward": 0.3716219961643219, |
|
"rewards/length_reward": 0.6571428596973419, |
|
"rewards/slop_reward": 0.8285714149475097, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 254.57144317626953, |
|
"epoch": 0.0525, |
|
"grad_norm": 10.08679593336001, |
|
"kl": 0.005413818359375, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0, |
|
"num_tokens": 546566.0, |
|
"reward": 1.2423678815364838, |
|
"reward_std": 0.4237713754177094, |
|
"rewards/classifier_reward": 0.2472785457968712, |
|
"rewards/length_reward": 0.45714286267757415, |
|
"rewards/slop_reward": 0.5379464238882065, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.60000915527343, |
|
"epoch": 0.055, |
|
"grad_norm": 120.69825805503362, |
|
"kl": 0.01463623046875, |
|
"learning_rate": 7.333333333333332e-07, |
|
"loss": 0.0, |
|
"num_tokens": 572155.0, |
|
"reward": 1.6102877855300903, |
|
"reward_std": 0.47996904850006106, |
|
"rewards/classifier_reward": 0.39421619176864625, |
|
"rewards/length_reward": 0.45714286267757415, |
|
"rewards/slop_reward": 0.7589285731315613, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.971435546875, |
|
"epoch": 0.0575, |
|
"grad_norm": 10.506951478641199, |
|
"kl": 0.0114501953125, |
|
"learning_rate": 7.666666666666667e-07, |
|
"loss": 0.0, |
|
"num_tokens": 598457.0, |
|
"reward": 1.7265684366226197, |
|
"reward_std": 0.5010857343673706, |
|
"rewards/classifier_reward": 0.35513979494571685, |
|
"rewards/length_reward": 0.542857152223587, |
|
"rewards/slop_reward": 0.8285714268684388, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.17143859863282, |
|
"epoch": 0.06, |
|
"grad_norm": 14.496994349875411, |
|
"kl": 0.019873046875, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0, |
|
"num_tokens": 623795.0, |
|
"reward": 1.9756683349609374, |
|
"reward_std": 0.6098839461803436, |
|
"rewards/classifier_reward": 0.3970968216657639, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.8928571343421936, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 149.71429138183595, |
|
"epoch": 0.0625, |
|
"grad_norm": 13.955034723905843, |
|
"kl": 0.023046875, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.0, |
|
"num_tokens": 646708.0, |
|
"reward": 1.7968461036682128, |
|
"reward_std": 0.45076006054878237, |
|
"rewards/classifier_reward": 0.3789888650178909, |
|
"rewards/length_reward": 0.5142857193946838, |
|
"rewards/slop_reward": 0.9035714268684387, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 172.02857818603516, |
|
"epoch": 0.065, |
|
"grad_norm": 13.086801218372846, |
|
"kl": 0.03857421875, |
|
"learning_rate": 8.666666666666667e-07, |
|
"loss": 0.0, |
|
"num_tokens": 670238.0, |
|
"reward": 2.018597435951233, |
|
"reward_std": 0.5030077040195465, |
|
"rewards/classifier_reward": 0.44716874957084657, |
|
"rewards/length_reward": 0.6571428656578064, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 139.0571487426758, |
|
"epoch": 0.0675, |
|
"grad_norm": 13.874334958965811, |
|
"kl": 0.044140625, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0, |
|
"num_tokens": 693025.0, |
|
"reward": 1.3813447833061219, |
|
"reward_std": 0.4743997871875763, |
|
"rewards/classifier_reward": 0.3313447292894125, |
|
"rewards/length_reward": 0.28571428954601286, |
|
"rewards/slop_reward": 0.7642857193946838, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 312.0857269287109, |
|
"epoch": 0.07, |
|
"grad_norm": 9.049982563385965, |
|
"kl": 0.0622314453125, |
|
"learning_rate": 9.333333333333333e-07, |
|
"loss": 0.0001, |
|
"num_tokens": 721725.0, |
|
"reward": 1.688427746295929, |
|
"reward_std": 0.4802214980125427, |
|
"rewards/classifier_reward": 0.48485626801848414, |
|
"rewards/length_reward": 0.5142857193946838, |
|
"rewards/slop_reward": 0.6892857074737548, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 170.34286499023438, |
|
"epoch": 0.0725, |
|
"grad_norm": 13.386821366059916, |
|
"kl": 0.100341796875, |
|
"learning_rate": 9.666666666666666e-07, |
|
"loss": 0.0001, |
|
"num_tokens": 745420.0, |
|
"reward": 1.488853096961975, |
|
"reward_std": 0.4731449127197266, |
|
"rewards/classifier_reward": 0.2870673179626465, |
|
"rewards/length_reward": 0.31428571939468386, |
|
"rewards/slop_reward": 0.8875, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.0571502685547, |
|
"epoch": 0.075, |
|
"grad_norm": 11.769241786475325, |
|
"kl": 0.08681640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 771967.0, |
|
"reward": 1.6088370084762573, |
|
"reward_std": 0.5624471366405487, |
|
"rewards/classifier_reward": 0.2784797720611095, |
|
"rewards/length_reward": 0.4857142925262451, |
|
"rewards/slop_reward": 0.8446428537368774, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 203.6285858154297, |
|
"epoch": 0.0775, |
|
"grad_norm": 15.786130440761754, |
|
"kl": 0.120703125, |
|
"learning_rate": 9.999819767255174e-07, |
|
"loss": 0.0001, |
|
"num_tokens": 797014.0, |
|
"reward": 1.9446904182434082, |
|
"reward_std": 0.36648078858852384, |
|
"rewards/classifier_reward": 0.4661188304424286, |
|
"rewards/length_reward": 0.5428571462631225, |
|
"rewards/slop_reward": 0.9357142806053161, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 201.771435546875, |
|
"epoch": 0.08, |
|
"grad_norm": 15.903852139666391, |
|
"kl": 0.3494140625, |
|
"learning_rate": 9.999279082014231e-07, |
|
"loss": 0.0003, |
|
"num_tokens": 821995.0, |
|
"reward": 1.9787015676498414, |
|
"reward_std": 0.585156524181366, |
|
"rewards/classifier_reward": 0.5215587019920349, |
|
"rewards/length_reward": 0.5428571552038193, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.42858276367187, |
|
"epoch": 0.0825, |
|
"grad_norm": 29.64188533235313, |
|
"kl": 0.500390625, |
|
"learning_rate": 9.998377983256848e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 847628.0, |
|
"reward": 2.037756896018982, |
|
"reward_std": 0.5854993224143982, |
|
"rewards/classifier_reward": 0.45918539762496946, |
|
"rewards/length_reward": 0.6857142984867096, |
|
"rewards/slop_reward": 0.8928571343421936, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 213.91429443359374, |
|
"epoch": 0.085, |
|
"grad_norm": 13.883875104448363, |
|
"kl": 1.748828125, |
|
"learning_rate": 9.997116535946027e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 872871.0, |
|
"reward": 1.7733258247375487, |
|
"reward_std": 0.5001925647258758, |
|
"rewards/classifier_reward": 0.4197543442249298, |
|
"rewards/length_reward": 0.6857142955064773, |
|
"rewards/slop_reward": 0.6678571343421936, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.771435546875, |
|
"epoch": 0.0875, |
|
"grad_norm": 14.166455798732835, |
|
"kl": 1.48515625, |
|
"learning_rate": 9.995494831023408e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 898833.0, |
|
"reward": 2.2283714771270753, |
|
"reward_std": 0.4383322179317474, |
|
"rewards/classifier_reward": 0.6283714175224304, |
|
"rewards/length_reward": 0.6857142865657806, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.5428680419922, |
|
"epoch": 0.09, |
|
"grad_norm": 20.07276091542321, |
|
"kl": 1.05703125, |
|
"learning_rate": 9.993512985402724e-07, |
|
"loss": 0.0011, |
|
"num_tokens": 924948.0, |
|
"reward": 1.8037936687469482, |
|
"reward_std": 0.4002750262618065, |
|
"rewards/classifier_reward": 0.4395078897476196, |
|
"rewards/length_reward": 0.4285714328289032, |
|
"rewards/slop_reward": 0.9357142806053161, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.42858123779297, |
|
"epoch": 0.0925, |
|
"grad_norm": 16.81239416666408, |
|
"kl": 7.96875, |
|
"learning_rate": 9.991171141961368e-07, |
|
"loss": 0.0085, |
|
"num_tokens": 951092.0, |
|
"reward": 1.9397377490997314, |
|
"reward_std": 0.4508820950984955, |
|
"rewards/classifier_reward": 0.6325947523117066, |
|
"rewards/length_reward": 0.4571428656578064, |
|
"rewards/slop_reward": 0.8499999880790711, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 196.17143707275392, |
|
"epoch": 0.095, |
|
"grad_norm": 19.19953442548882, |
|
"kl": 3.775, |
|
"learning_rate": 9.988469469530085e-07, |
|
"loss": 0.0038, |
|
"num_tokens": 975652.0, |
|
"reward": 2.0388022661209106, |
|
"reward_std": 0.3248827219009399, |
|
"rewards/classifier_reward": 0.32451642835512756, |
|
"rewards/length_reward": 0.7142857193946839, |
|
"rewards/slop_reward": 1.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 352.22858276367185, |
|
"epoch": 0.0975, |
|
"grad_norm": 18.25441612913441, |
|
"kl": 0.79091796875, |
|
"learning_rate": 9.985408162880813e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 1005900.0, |
|
"reward": 1.6981183767318726, |
|
"reward_std": 0.433578160405159, |
|
"rewards/classifier_reward": 0.5351718068122864, |
|
"rewards/length_reward": 0.40000000298023225, |
|
"rewards/slop_reward": 0.7629464268684387, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 321.02858276367186, |
|
"epoch": 0.1, |
|
"grad_norm": 10.54620387526757, |
|
"kl": 0.754296875, |
|
"learning_rate": 9.98198744271263e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 1034877.0, |
|
"reward": 1.9676434993743896, |
|
"reward_std": 0.4024129122495651, |
|
"rewards/classifier_reward": 0.6390719175338745, |
|
"rewards/length_reward": 0.4571428656578064, |
|
"rewards/slop_reward": 0.8714285612106323, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.17143859863282, |
|
"epoch": 0.1025, |
|
"grad_norm": 13.824754472854506, |
|
"kl": 0.911328125, |
|
"learning_rate": 9.978207555635855e-07, |
|
"loss": 0.0009, |
|
"num_tokens": 1060157.0, |
|
"reward": 2.1357697248458862, |
|
"reward_std": 0.5395443201065063, |
|
"rewards/classifier_reward": 0.6357696294784546, |
|
"rewards/length_reward": 0.5428571552038193, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 223.6285827636719, |
|
"epoch": 0.105, |
|
"grad_norm": 18.073667394788664, |
|
"kl": 0.519921875, |
|
"learning_rate": 9.97406877415425e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1085893.0, |
|
"reward": 2.068192982673645, |
|
"reward_std": 0.4686335951089859, |
|
"rewards/classifier_reward": 0.4967642992734909, |
|
"rewards/length_reward": 0.6571428775787354, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 297.40001220703124, |
|
"epoch": 0.1075, |
|
"grad_norm": 10.8725727731831, |
|
"kl": 0.43515625, |
|
"learning_rate": 9.9695713966454e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1114056.0, |
|
"reward": 1.6727801322937013, |
|
"reward_std": 0.501282411813736, |
|
"rewards/classifier_reward": 0.501351535320282, |
|
"rewards/length_reward": 0.3428571492433548, |
|
"rewards/slop_reward": 0.8285714149475097, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 204.85715026855468, |
|
"epoch": 0.11, |
|
"grad_norm": 11.442387542173964, |
|
"kl": 0.574609375, |
|
"learning_rate": 9.964715747339175e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 1138804.0, |
|
"reward": 2.027732276916504, |
|
"reward_std": 0.6377828001976014, |
|
"rewards/classifier_reward": 0.6545179545879364, |
|
"rewards/length_reward": 0.48571430146694183, |
|
"rewards/slop_reward": 0.8875, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 290.6285888671875, |
|
"epoch": 0.1125, |
|
"grad_norm": 11.010885455058528, |
|
"kl": 0.451171875, |
|
"learning_rate": 9.959502176294382e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1166842.0, |
|
"reward": 1.8717997074127197, |
|
"reward_std": 0.45424606651067734, |
|
"rewards/classifier_reward": 0.45037112236022947, |
|
"rewards/length_reward": 0.4857142925262451, |
|
"rewards/slop_reward": 0.9357142806053161, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.60001220703126, |
|
"epoch": 0.115, |
|
"grad_norm": 10.74794734294439, |
|
"kl": 0.378125, |
|
"learning_rate": 9.953931059373511e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1193568.0, |
|
"reward": 2.1009025812149047, |
|
"reward_std": 0.5576321303844451, |
|
"rewards/classifier_reward": 0.6455453038215637, |
|
"rewards/length_reward": 0.6000000059604644, |
|
"rewards/slop_reward": 0.8553571462631225, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 205.51429595947266, |
|
"epoch": 0.1175, |
|
"grad_norm": 12.174860136090478, |
|
"kl": 0.580859375, |
|
"learning_rate": 9.948002798215631e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 1218520.0, |
|
"reward": 1.7478339910507201, |
|
"reward_std": 0.44800390899181364, |
|
"rewards/classifier_reward": 0.5246196419000626, |
|
"rewards/length_reward": 0.4000000059604645, |
|
"rewards/slop_reward": 0.8232142806053162, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 206.371435546875, |
|
"epoch": 0.12, |
|
"grad_norm": 10.734345225393307, |
|
"kl": 0.77734375, |
|
"learning_rate": 9.94171782020746e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 1243553.0, |
|
"reward": 2.33663330078125, |
|
"reward_std": 0.5126991689205169, |
|
"rewards/classifier_reward": 0.6152046620845795, |
|
"rewards/length_reward": 0.7428571522235871, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 349.8000244140625, |
|
"epoch": 0.1225, |
|
"grad_norm": 9.516105487183257, |
|
"kl": 0.6921875, |
|
"learning_rate": 9.935076578452534e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 1273677.0, |
|
"reward": 1.6252358913421632, |
|
"reward_std": 0.4947394013404846, |
|
"rewards/classifier_reward": 0.5180929381400347, |
|
"rewards/length_reward": 0.25714286267757414, |
|
"rewards/slop_reward": 0.85, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.57143859863282, |
|
"epoch": 0.125, |
|
"grad_norm": 11.582425399894849, |
|
"kl": 1.07734375, |
|
"learning_rate": 9.928079551738541e-07, |
|
"loss": 0.0011, |
|
"num_tokens": 1299334.0, |
|
"reward": 2.2410747528076174, |
|
"reward_std": 0.5065959393978119, |
|
"rewards/classifier_reward": 0.5834853827953339, |
|
"rewards/length_reward": 0.6857143044471741, |
|
"rewards/slop_reward": 0.971875, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 254.68572692871095, |
|
"epoch": 0.1275, |
|
"grad_norm": 9.691651370929392, |
|
"kl": 0.7765625, |
|
"learning_rate": 9.920727244502818e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 1326112.0, |
|
"reward": 1.8951802968978881, |
|
"reward_std": 0.561316728591919, |
|
"rewards/classifier_reward": 0.4666088119149208, |
|
"rewards/length_reward": 0.5142857283353806, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.82858581542968, |
|
"epoch": 0.13, |
|
"grad_norm": 9.231142781375425, |
|
"kl": 0.4640625, |
|
"learning_rate": 9.913020186795966e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1352635.0, |
|
"reward": 2.4359071254730225, |
|
"reward_std": 0.504255086183548, |
|
"rewards/classifier_reward": 0.8644783020019531, |
|
"rewards/length_reward": 0.6571428656578064, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 336.2857360839844, |
|
"epoch": 0.1325, |
|
"grad_norm": 8.070502355443343, |
|
"kl": 1.01142578125, |
|
"learning_rate": 9.904958934243653e-07, |
|
"loss": 0.001, |
|
"num_tokens": 1382325.0, |
|
"reward": 1.9690003395080566, |
|
"reward_std": 0.4140917003154755, |
|
"rewards/classifier_reward": 0.595339572429657, |
|
"rewards/length_reward": 0.5142857253551483, |
|
"rewards/slop_reward": 0.859375, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 176.85714721679688, |
|
"epoch": 0.135, |
|
"grad_norm": 13.273584784804553, |
|
"kl": 0.737109375, |
|
"learning_rate": 9.89654406800655e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 1406364.0, |
|
"reward": 2.015079474449158, |
|
"reward_std": 0.34893424808979034, |
|
"rewards/classifier_reward": 0.6650793373584747, |
|
"rewards/length_reward": 0.37142857611179353, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 270.77144317626954, |
|
"epoch": 0.1375, |
|
"grad_norm": 10.259811273773323, |
|
"kl": 0.41875, |
|
"learning_rate": 9.887776194738431e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1433517.0, |
|
"reward": 2.0523552179336546, |
|
"reward_std": 0.43476098477840425, |
|
"rewards/classifier_reward": 0.72378368973732, |
|
"rewards/length_reward": 0.37142857909202576, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 270.20001220703125, |
|
"epoch": 0.14, |
|
"grad_norm": 98.33006992571028, |
|
"kl": 25.924609375, |
|
"learning_rate": 9.878655946542442e-07, |
|
"loss": 0.0258, |
|
"num_tokens": 1460894.0, |
|
"reward": 2.2650604486465453, |
|
"reward_std": 0.5314578056335449, |
|
"rewards/classifier_reward": 0.7382745862007141, |
|
"rewards/length_reward": 0.6285714358091354, |
|
"rewards/slop_reward": 0.8982142806053162, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 213.82858581542968, |
|
"epoch": 0.1425, |
|
"grad_norm": 11.336151350522224, |
|
"kl": 0.592578125, |
|
"learning_rate": 9.86918398092553e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 1486239.0, |
|
"reward": 2.248945116996765, |
|
"reward_std": 0.4141096830368042, |
|
"rewards/classifier_reward": 0.6918022215366364, |
|
"rewards/length_reward": 0.6000000059604644, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 301.85715637207034, |
|
"epoch": 0.145, |
|
"grad_norm": 9.956188328948622, |
|
"kl": 0.44765625, |
|
"learning_rate": 9.85936098075104e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1514661.0, |
|
"reward": 1.9887679100036622, |
|
"reward_std": 0.47130251824855807, |
|
"rewards/classifier_reward": 0.6173391878604889, |
|
"rewards/length_reward": 0.45714286267757415, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 194.42857971191407, |
|
"epoch": 0.1475, |
|
"grad_norm": 10.413710834913637, |
|
"kl": 0.428125, |
|
"learning_rate": 9.849187654189485e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1539249.0, |
|
"reward": 2.1201124668121336, |
|
"reward_std": 0.4662812829017639, |
|
"rewards/classifier_reward": 0.8486838459968566, |
|
"rewards/length_reward": 0.3142857253551483, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 223.71429443359375, |
|
"epoch": 0.15, |
|
"grad_norm": 9.48253169035506, |
|
"kl": 0.6697265625, |
|
"learning_rate": 9.838664734667495e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 1564932.0, |
|
"reward": 2.3097215414047243, |
|
"reward_std": 0.37846060991287234, |
|
"rewards/classifier_reward": 0.6097213685512543, |
|
"rewards/length_reward": 0.7428571492433548, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 213.08572235107422, |
|
"epoch": 0.1525, |
|
"grad_norm": 17.056291850314448, |
|
"kl": 0.484375, |
|
"learning_rate": 9.827792980814933e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1590245.0, |
|
"reward": 2.139114594459534, |
|
"reward_std": 0.3371311604976654, |
|
"rewards/classifier_reward": 0.6605431139469147, |
|
"rewards/length_reward": 0.5428571492433548, |
|
"rewards/slop_reward": 0.9357142806053161, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.68572540283202, |
|
"epoch": 0.155, |
|
"grad_norm": 9.724576982946711, |
|
"kl": 0.580859375, |
|
"learning_rate": 9.81657317641022e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 1615704.0, |
|
"reward": 2.022816562652588, |
|
"reward_std": 0.4144854575395584, |
|
"rewards/classifier_reward": 0.5585307866334915, |
|
"rewards/length_reward": 0.4857142955064774, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 343.8285888671875, |
|
"epoch": 0.1575, |
|
"grad_norm": 7.143874630694377, |
|
"kl": 0.4015625, |
|
"learning_rate": 9.805006130323808e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1645654.0, |
|
"reward": 2.0719400882720946, |
|
"reward_std": 0.3876799166202545, |
|
"rewards/classifier_reward": 0.6647971898317337, |
|
"rewards/length_reward": 0.42857143878936765, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 266.00001220703126, |
|
"epoch": 0.16, |
|
"grad_norm": 9.518081765887688, |
|
"kl": 0.3986328125, |
|
"learning_rate": 9.793092676459888e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1672883.0, |
|
"reward": 2.084528160095215, |
|
"reward_std": 0.473650124669075, |
|
"rewards/classifier_reward": 0.584528061747551, |
|
"rewards/length_reward": 0.5428571552038193, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 279.71429748535155, |
|
"epoch": 0.1625, |
|
"grad_norm": 8.015308302511995, |
|
"kl": 0.54375, |
|
"learning_rate": 9.780833673696254e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1700438.0, |
|
"reward": 2.3380573272705076, |
|
"reward_std": 0.4121220216155052, |
|
"rewards/classifier_reward": 0.8237714767456055, |
|
"rewards/length_reward": 0.5142857223749161, |
|
"rewards/slop_reward": 1.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.31430053710938, |
|
"epoch": 0.165, |
|
"grad_norm": 8.749621821849056, |
|
"kl": 0.421484375, |
|
"learning_rate": 9.768230005822393e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1727047.0, |
|
"reward": 2.2453027963638306, |
|
"reward_std": 0.38129588067531583, |
|
"rewards/classifier_reward": 0.781016880273819, |
|
"rewards/length_reward": 0.48571428954601287, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 283.3142974853516, |
|
"epoch": 0.1675, |
|
"grad_norm": 15.835663084851635, |
|
"kl": 0.4734375, |
|
"learning_rate": 9.755282581475767e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1754863.0, |
|
"reward": 2.350207304954529, |
|
"reward_std": 0.5121555209159852, |
|
"rewards/classifier_reward": 0.7930643558502197, |
|
"rewards/length_reward": 0.6000000149011612, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 207.0571502685547, |
|
"epoch": 0.17, |
|
"grad_norm": 10.560189239731793, |
|
"kl": 0.418359375, |
|
"learning_rate": 9.741992334076308e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 1780017.0, |
|
"reward": 2.56486029624939, |
|
"reward_std": 0.4183764517307281, |
|
"rewards/classifier_reward": 0.7505744874477387, |
|
"rewards/length_reward": 0.8571428656578064, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 207.31429443359374, |
|
"epoch": 0.1725, |
|
"grad_norm": 12.483812322806989, |
|
"kl": 0.690234375, |
|
"learning_rate": 9.728360221759123e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 1805172.0, |
|
"reward": 2.421455478668213, |
|
"reward_std": 0.5775963604450226, |
|
"rewards/classifier_reward": 0.7643125176429748, |
|
"rewards/length_reward": 0.7428571522235871, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 262.8285858154297, |
|
"epoch": 0.175, |
|
"grad_norm": 10.184314056992156, |
|
"kl": 0.665234375, |
|
"learning_rate": 9.71438722730542e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 1832291.0, |
|
"reward": 2.078964352607727, |
|
"reward_std": 0.5653822362422943, |
|
"rewards/classifier_reward": 0.6146785497665406, |
|
"rewards/length_reward": 0.4857142984867096, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 338.9428741455078, |
|
"epoch": 0.1775, |
|
"grad_norm": 8.806960986372623, |
|
"kl": 0.46015625, |
|
"learning_rate": 9.700074358071656e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1861997.0, |
|
"reward": 2.0550333499908446, |
|
"reward_std": 0.5070074677467347, |
|
"rewards/classifier_reward": 0.7263502657413483, |
|
"rewards/length_reward": 0.40000001192092893, |
|
"rewards/slop_reward": 0.9286830306053162, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 270.0857299804687, |
|
"epoch": 0.18, |
|
"grad_norm": 9.059350132300025, |
|
"kl": 0.450390625, |
|
"learning_rate": 9.685422645916918e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1888854.0, |
|
"reward": 2.531459331512451, |
|
"reward_std": 0.3892214775085449, |
|
"rewards/classifier_reward": 0.860030734539032, |
|
"rewards/length_reward": 0.7142857193946839, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 291.4285827636719, |
|
"epoch": 0.1825, |
|
"grad_norm": 12.813964228104469, |
|
"kl": 0.52265625, |
|
"learning_rate": 9.670433147128521e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1916974.0, |
|
"reward": 2.2648436784744264, |
|
"reward_std": 0.40854659080505373, |
|
"rewards/classifier_reward": 0.8577007412910461, |
|
"rewards/length_reward": 0.42857143878936765, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.31429748535157, |
|
"epoch": 0.185, |
|
"grad_norm": 241.73474838722862, |
|
"kl": 0.453125, |
|
"learning_rate": 9.655106942345868e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1943440.0, |
|
"reward": 2.366797590255737, |
|
"reward_std": 0.41730722188949587, |
|
"rewards/classifier_reward": 0.766797399520874, |
|
"rewards/length_reward": 0.600000011920929, |
|
"rewards/slop_reward": 1.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.91429443359374, |
|
"epoch": 0.1875, |
|
"grad_norm": 8.395335843546244, |
|
"kl": 0.5015625, |
|
"learning_rate": 9.639445136482546e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1969900.0, |
|
"reward": 2.4618414878845214, |
|
"reward_std": 0.540682977437973, |
|
"rewards/classifier_reward": 0.8261271595954895, |
|
"rewards/length_reward": 0.6571428716182709, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.11429748535156, |
|
"epoch": 0.19, |
|
"grad_norm": 9.31969072416839, |
|
"kl": 0.53671875, |
|
"learning_rate": 9.623448858646656e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 1996327.0, |
|
"reward": 2.259377145767212, |
|
"reward_std": 0.4671668648719788, |
|
"rewards/classifier_reward": 0.916519820690155, |
|
"rewards/length_reward": 0.3428571581840515, |
|
"rewards/slop_reward": 1.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 316.9714416503906, |
|
"epoch": 0.1925, |
|
"grad_norm": 7.365001527017238, |
|
"kl": 1.408984375, |
|
"learning_rate": 9.607119262059425e-07, |
|
"loss": 0.0014, |
|
"num_tokens": 2024968.0, |
|
"reward": 2.266000509262085, |
|
"reward_std": 0.5217303335666656, |
|
"rewards/classifier_reward": 0.8231433391571045, |
|
"rewards/length_reward": 0.48571430146694183, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.62857971191406, |
|
"epoch": 0.195, |
|
"grad_norm": 45.20061919834423, |
|
"kl": 23.44453125, |
|
"learning_rate": 9.590457523972055e-07, |
|
"loss": 0.0236, |
|
"num_tokens": 2050735.0, |
|
"reward": 2.3131242275238035, |
|
"reward_std": 0.5057340741157532, |
|
"rewards/classifier_reward": 0.8827670216560364, |
|
"rewards/length_reward": 0.4571428656578064, |
|
"rewards/slop_reward": 0.9732142806053161, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 217.91429443359374, |
|
"epoch": 0.1975, |
|
"grad_norm": 9.435990839066255, |
|
"kl": 0.4484375, |
|
"learning_rate": 9.573464845580863e-07, |
|
"loss": 0.0004, |
|
"num_tokens": 2076160.0, |
|
"reward": 2.3861000537872314, |
|
"reward_std": 0.6137513637542724, |
|
"rewards/classifier_reward": 0.8432427525520325, |
|
"rewards/length_reward": 0.5428571581840516, |
|
"rewards/slop_reward": 1.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.91429443359374, |
|
"epoch": 0.2, |
|
"grad_norm": 7.98212579749696, |
|
"kl": 0.5078125, |
|
"learning_rate": 9.556142451940679e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2102862.0, |
|
"reward": 2.2402546644210815, |
|
"reward_std": 0.602302199602127, |
|
"rewards/classifier_reward": 0.7473974108695984, |
|
"rewards/length_reward": 0.600000011920929, |
|
"rewards/slop_reward": 0.8928571343421936, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 287.34287414550784, |
|
"epoch": 0.2025, |
|
"grad_norm": 8.04736588295688, |
|
"kl": 0.61015625, |
|
"learning_rate": 9.53849159187652e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2130556.0, |
|
"reward": 2.3916960954666138, |
|
"reward_std": 0.45145381689071656, |
|
"rewards/classifier_reward": 0.9631245970726013, |
|
"rewards/length_reward": 0.5142857253551483, |
|
"rewards/slop_reward": 0.9142857074737549, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 264.02857666015626, |
|
"epoch": 0.205, |
|
"grad_norm": 8.203747363662693, |
|
"kl": 0.523828125, |
|
"learning_rate": 9.520513537893573e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2157656.0, |
|
"reward": 2.4363236665725707, |
|
"reward_std": 0.32723597437143326, |
|
"rewards/classifier_reward": 0.7720378637313843, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.02858276367186, |
|
"epoch": 0.2075, |
|
"grad_norm": 7.89698513820585, |
|
"kl": 0.556640625, |
|
"learning_rate": 9.502209586085442e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2183592.0, |
|
"reward": 2.6267528533935547, |
|
"reward_std": 0.41746888160705564, |
|
"rewards/classifier_reward": 0.9338955879211426, |
|
"rewards/length_reward": 0.7142857253551483, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.37144165039064, |
|
"epoch": 0.21, |
|
"grad_norm": 8.428058041901203, |
|
"kl": 0.534765625, |
|
"learning_rate": 9.483581056040718e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2209201.0, |
|
"reward": 2.4542306900024413, |
|
"reward_std": 0.4101273000240326, |
|
"rewards/classifier_reward": 0.9113734126091003, |
|
"rewards/length_reward": 0.5428571552038193, |
|
"rewards/slop_reward": 1.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 257.71429443359375, |
|
"epoch": 0.2125, |
|
"grad_norm": 9.035933890702934, |
|
"kl": 0.5140625, |
|
"learning_rate": 9.464629290747842e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2236141.0, |
|
"reward": 2.5825096130371095, |
|
"reward_std": 0.45454559922218324, |
|
"rewards/classifier_reward": 0.9753666520118713, |
|
"rewards/length_reward": 0.6285714417695999, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.74286804199218, |
|
"epoch": 0.215, |
|
"grad_norm": 9.318935519161128, |
|
"kl": 0.555078125, |
|
"learning_rate": 9.445355656498284e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2262697.0, |
|
"reward": 2.383002519607544, |
|
"reward_std": 0.5029416978359222, |
|
"rewards/classifier_reward": 0.8830024480819703, |
|
"rewards/length_reward": 0.5428571611642837, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 324.9428741455078, |
|
"epoch": 0.2175, |
|
"grad_norm": 10.476800992230684, |
|
"kl": 0.515234375, |
|
"learning_rate": 9.425761542788048e-07, |
|
"loss": 0.001, |
|
"num_tokens": 2291611.0, |
|
"reward": 2.4253102779388427, |
|
"reward_std": 0.25630177855491637, |
|
"rewards/classifier_reward": 0.9878101944923401, |
|
"rewards/length_reward": 0.4857142955064774, |
|
"rewards/slop_reward": 0.9517857193946838, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 285.88572387695314, |
|
"epoch": 0.22, |
|
"grad_norm": 8.262503873333891, |
|
"kl": 0.623828125, |
|
"learning_rate": 9.40584836221749e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2319400.0, |
|
"reward": 2.459568977355957, |
|
"reward_std": 0.5227903485298157, |
|
"rewards/classifier_reward": 0.9595688700675964, |
|
"rewards/length_reward": 0.5428571581840516, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 361.48572998046876, |
|
"epoch": 0.2225, |
|
"grad_norm": 8.639609161679585, |
|
"kl": 0.61015625, |
|
"learning_rate": 9.385617550389489e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2349972.0, |
|
"reward": 2.21625759601593, |
|
"reward_std": 0.43572868704795836, |
|
"rewards/classifier_reward": 0.8430432677268982, |
|
"rewards/length_reward": 0.4000000089406967, |
|
"rewards/slop_reward": 0.9732142806053161, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 230.5428680419922, |
|
"epoch": 0.225, |
|
"grad_norm": 9.545010126368608, |
|
"kl": 0.606640625, |
|
"learning_rate": 9.36507056580594e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2375941.0, |
|
"reward": 2.5845241069793703, |
|
"reward_std": 0.4674242250621319, |
|
"rewards/classifier_reward": 0.9559526205062866, |
|
"rewards/length_reward": 0.6285714387893677, |
|
"rewards/slop_reward": 1.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.971435546875, |
|
"epoch": 0.2275, |
|
"grad_norm": 8.313767260844978, |
|
"kl": 0.705859375, |
|
"learning_rate": 9.34420888976262e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2402820.0, |
|
"reward": 2.281861972808838, |
|
"reward_std": 0.5015565395355225, |
|
"rewards/classifier_reward": 0.9032904744148255, |
|
"rewards/length_reward": 0.48571430444717406, |
|
"rewards/slop_reward": 0.8928571343421936, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 285.71430053710935, |
|
"epoch": 0.23, |
|
"grad_norm": 15.296685695089757, |
|
"kl": 12.5765625, |
|
"learning_rate": 9.323034026242377e-07, |
|
"loss": 0.0126, |
|
"num_tokens": 2430740.0, |
|
"reward": 2.457769823074341, |
|
"reward_std": 0.4592562437057495, |
|
"rewards/classifier_reward": 0.9363411664962769, |
|
"rewards/length_reward": 0.542857152223587, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 283.9143035888672, |
|
"epoch": 0.2325, |
|
"grad_norm": 41.32916438788783, |
|
"kl": 0.500390625, |
|
"learning_rate": 9.301547501806724e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2458445.0, |
|
"reward": 2.432806062698364, |
|
"reward_std": 0.4543603718280792, |
|
"rewards/classifier_reward": 0.8256630301475525, |
|
"rewards/length_reward": 0.6285714387893677, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.2571533203125, |
|
"epoch": 0.235, |
|
"grad_norm": 8.514937250494981, |
|
"kl": 0.56875, |
|
"learning_rate": 9.279750865485772e-07, |
|
"loss": 0.001, |
|
"num_tokens": 2484459.0, |
|
"reward": 2.6583719730377195, |
|
"reward_std": 0.41457981467247007, |
|
"rewards/classifier_reward": 0.9726575374603271, |
|
"rewards/length_reward": 0.6857142984867096, |
|
"rewards/slop_reward": 1.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.4571563720703, |
|
"epoch": 0.2375, |
|
"grad_norm": 8.130278831610969, |
|
"kl": 0.586328125, |
|
"learning_rate": 9.257645688666555e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2510647.0, |
|
"reward": 2.552411127090454, |
|
"reward_std": 0.444735050201416, |
|
"rewards/classifier_reward": 0.9220538377761841, |
|
"rewards/length_reward": 0.6571428656578064, |
|
"rewards/slop_reward": 0.9732142806053161, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.48572692871093, |
|
"epoch": 0.24, |
|
"grad_norm": 11.012318216722969, |
|
"kl": 0.559375, |
|
"learning_rate": 9.235233564979754e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2537166.0, |
|
"reward": 2.3741667747497557, |
|
"reward_std": 0.5231991052627564, |
|
"rewards/classifier_reward": 0.7955952703952789, |
|
"rewards/length_reward": 0.600000011920929, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.91429748535157, |
|
"epoch": 0.2425, |
|
"grad_norm": 9.607401037517901, |
|
"kl": 0.503125, |
|
"learning_rate": 9.212516110184794e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 2563588.0, |
|
"reward": 2.6350881576538088, |
|
"reward_std": 0.47173853516578673, |
|
"rewards/classifier_reward": 0.8618737578392028, |
|
"rewards/length_reward": 0.8000000059604645, |
|
"rewards/slop_reward": 0.9732142806053161, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 193.971435546875, |
|
"epoch": 0.245, |
|
"grad_norm": 10.623433793787921, |
|
"kl": 0.7234375, |
|
"learning_rate": 9.189494962053368e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2588297.0, |
|
"reward": 2.389581322669983, |
|
"reward_std": 0.4812875479459763, |
|
"rewards/classifier_reward": 0.903866958618164, |
|
"rewards/length_reward": 0.4857142984867096, |
|
"rewards/slop_reward": 1.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 171.51429443359376, |
|
"epoch": 0.2475, |
|
"grad_norm": 11.438435269316447, |
|
"kl": 0.64375, |
|
"learning_rate": 9.166171780251364e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2612015.0, |
|
"reward": 2.513836717605591, |
|
"reward_std": 0.26052397638559344, |
|
"rewards/classifier_reward": 0.8852650642395019, |
|
"rewards/length_reward": 0.6285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 167.14286346435546, |
|
"epoch": 0.25, |
|
"grad_norm": 11.041028322429328, |
|
"kl": 0.7359375, |
|
"learning_rate": 9.14254824621921e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2635785.0, |
|
"reward": 2.355943202972412, |
|
"reward_std": 0.33155601024627684, |
|
"rewards/classifier_reward": 0.8416573882102967, |
|
"rewards/length_reward": 0.5142857193946838, |
|
"rewards/slop_reward": 1.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 163.74286499023438, |
|
"epoch": 0.2525, |
|
"grad_norm": 9.00301084087431, |
|
"kl": 0.75390625, |
|
"learning_rate": 9.118626063050661e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 2659436.0, |
|
"reward": 2.5006046295166016, |
|
"reward_std": 0.4252849280834198, |
|
"rewards/classifier_reward": 0.9506044745445251, |
|
"rewards/length_reward": 0.571428582072258, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 195.42858276367187, |
|
"epoch": 0.255, |
|
"grad_norm": 9.702866580028527, |
|
"kl": 0.79921875, |
|
"learning_rate": 9.094406955370008e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 2683861.0, |
|
"reward": 2.5624767780303954, |
|
"reward_std": 0.47246721386909485, |
|
"rewards/classifier_reward": 0.9267622709274292, |
|
"rewards/length_reward": 0.6571428716182709, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 183.8857223510742, |
|
"epoch": 0.2575, |
|
"grad_norm": 7.999927126827247, |
|
"kl": 0.7359375, |
|
"learning_rate": 9.069892669207757e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2708217.0, |
|
"reward": 2.4374377012252806, |
|
"reward_std": 0.3513069462031126, |
|
"rewards/classifier_reward": 0.8374375879764557, |
|
"rewards/length_reward": 0.6000000059604644, |
|
"rewards/slop_reward": 1.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 203.74286804199218, |
|
"epoch": 0.26, |
|
"grad_norm": 8.535286841601573, |
|
"kl": 0.6046875, |
|
"learning_rate": 9.045084971874737e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2733039.0, |
|
"reward": 2.590514373779297, |
|
"reward_std": 0.3418044149875641, |
|
"rewards/classifier_reward": 0.7905142605304718, |
|
"rewards/length_reward": 0.8000000059604645, |
|
"rewards/slop_reward": 1.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 186.88572387695314, |
|
"epoch": 0.2625, |
|
"grad_norm": 11.150131541909593, |
|
"kl": 2.3078125, |
|
"learning_rate": 9.019985651834703e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 2757500.0, |
|
"reward": 2.6002991676330565, |
|
"reward_std": 0.47049993872642515, |
|
"rewards/classifier_reward": 0.850299060344696, |
|
"rewards/length_reward": 0.7714285790920258, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.48572692871093, |
|
"epoch": 0.265, |
|
"grad_norm": 7.346740346808247, |
|
"kl": 0.694140625, |
|
"learning_rate": 8.994596518575391e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2783522.0, |
|
"reward": 2.6393914222717285, |
|
"reward_std": 0.4599771976470947, |
|
"rewards/classifier_reward": 0.8965341806411743, |
|
"rewards/length_reward": 0.7428571581840515, |
|
"rewards/slop_reward": 1.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 293.51429748535156, |
|
"epoch": 0.2675, |
|
"grad_norm": 7.782363877670758, |
|
"kl": 0.71328125, |
|
"learning_rate": 8.968919402478075e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 2811715.0, |
|
"reward": 2.497778224945068, |
|
"reward_std": 0.49988613873720167, |
|
"rewards/classifier_reward": 0.8692066669464111, |
|
"rewards/length_reward": 0.6285714387893677, |
|
"rewards/slop_reward": 1.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.40000915527344, |
|
"epoch": 0.27, |
|
"grad_norm": 8.407812702495294, |
|
"kl": 0.9671875, |
|
"learning_rate": 8.942956154685595e-07, |
|
"loss": 0.001, |
|
"num_tokens": 2836377.0, |
|
"reward": 2.7594990730285645, |
|
"reward_std": 0.35219337940216067, |
|
"rewards/classifier_reward": 0.9237847208976746, |
|
"rewards/length_reward": 0.8571428656578064, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 311.14288024902345, |
|
"epoch": 0.2725, |
|
"grad_norm": 10.954001826900827, |
|
"kl": 0.803515625, |
|
"learning_rate": 8.916708646968923e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 2865187.0, |
|
"reward": 2.3341631174087523, |
|
"reward_std": 0.3795748669654131, |
|
"rewards/classifier_reward": 0.6913058979436755, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 295.88572387695314, |
|
"epoch": 0.275, |
|
"grad_norm": 7.24224369671047, |
|
"kl": 0.6296875, |
|
"learning_rate": 8.890178771592197e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 2893081.0, |
|
"reward": 2.574794292449951, |
|
"reward_std": 0.4517861694097519, |
|
"rewards/classifier_reward": 0.9319370150566101, |
|
"rewards/length_reward": 0.6857142984867096, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 310.4285858154297, |
|
"epoch": 0.2775, |
|
"grad_norm": 6.977209722960256, |
|
"kl": 1.34609375, |
|
"learning_rate": 8.863368441176325e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 2921771.0, |
|
"reward": 2.2888038635253904, |
|
"reward_std": 0.6448704779148102, |
|
"rewards/classifier_reward": 0.8455002188682557, |
|
"rewards/length_reward": 0.5142857253551483, |
|
"rewards/slop_reward": 0.9290178537368774, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.82858276367188, |
|
"epoch": 0.28, |
|
"grad_norm": 6.8381698154250525, |
|
"kl": 0.86640625, |
|
"learning_rate": 8.836279588561081e-07, |
|
"loss": 0.0009, |
|
"num_tokens": 2948383.0, |
|
"reward": 2.4712666511535644, |
|
"reward_std": 0.4860814154148102, |
|
"rewards/classifier_reward": 0.8141236484050751, |
|
"rewards/length_reward": 0.6571428716182709, |
|
"rewards/slop_reward": 1.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 275.74287109375, |
|
"epoch": 0.2825, |
|
"grad_norm": 5.967234970457837, |
|
"kl": 0.8515625, |
|
"learning_rate": 8.808914166665772e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 2975877.0, |
|
"reward": 2.6856667041778564, |
|
"reward_std": 0.38376912772655486, |
|
"rewards/classifier_reward": 0.9856665849685669, |
|
"rewards/length_reward": 0.7428571522235871, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 291.1428680419922, |
|
"epoch": 0.285, |
|
"grad_norm": 6.696088875243213, |
|
"kl": 0.61484375, |
|
"learning_rate": 8.781274148348436e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 3003901.0, |
|
"reward": 2.584765911102295, |
|
"reward_std": 0.441849821805954, |
|
"rewards/classifier_reward": 0.8704800248146057, |
|
"rewards/length_reward": 0.7142857313156128, |
|
"rewards/slop_reward": 1.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 286.6000152587891, |
|
"epoch": 0.2875, |
|
"grad_norm": 5.913822453334042, |
|
"kl": 0.678125, |
|
"learning_rate": 8.753361526263621e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 3031852.0, |
|
"reward": 2.6028482913970947, |
|
"reward_std": 0.2958831213414669, |
|
"rewards/classifier_reward": 0.9957053542137146, |
|
"rewards/length_reward": 0.6285714387893677, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 292.20001525878905, |
|
"epoch": 0.29, |
|
"grad_norm": 5.738716860165764, |
|
"kl": 0.72578125, |
|
"learning_rate": 8.725178312718725e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 3059999.0, |
|
"reward": 2.596400237083435, |
|
"reward_std": 0.3507813632488251, |
|
"rewards/classifier_reward": 0.953542971611023, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.1428680419922, |
|
"epoch": 0.2925, |
|
"grad_norm": 69.40308583594562, |
|
"kl": 1.6, |
|
"learning_rate": 8.696726539528923e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 3086289.0, |
|
"reward": 2.7766035079956053, |
|
"reward_std": 0.3328893929719925, |
|
"rewards/classifier_reward": 0.8908890843391418, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 259.22858276367185, |
|
"epoch": 0.295, |
|
"grad_norm": 5.40703806614853, |
|
"kl": 0.734765625, |
|
"learning_rate": 8.668008257870682e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 3113282.0, |
|
"reward": 2.7762694358825684, |
|
"reward_std": 0.2866129666566849, |
|
"rewards/classifier_reward": 0.9476978421211243, |
|
"rewards/length_reward": 0.8285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 201.91429443359374, |
|
"epoch": 0.2975, |
|
"grad_norm": 7.252699872821293, |
|
"kl": 0.81015625, |
|
"learning_rate": 8.639025538133897e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 3138256.0, |
|
"reward": 2.851440095901489, |
|
"reward_std": 0.20830639004707335, |
|
"rewards/classifier_reward": 0.9085827589035034, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.20001525878905, |
|
"epoch": 0.3, |
|
"grad_norm": 6.062851209216701, |
|
"kl": 0.92578125, |
|
"learning_rate": 8.609780469772621e-07, |
|
"loss": 0.0014, |
|
"num_tokens": 3164933.0, |
|
"reward": 2.786311960220337, |
|
"reward_std": 0.29145972728729247, |
|
"rewards/classifier_reward": 0.9005975008010865, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 199.4857208251953, |
|
"epoch": 0.3025, |
|
"grad_norm": 8.363986536112053, |
|
"kl": 0.746875, |
|
"learning_rate": 8.580275161154431e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 3189764.0, |
|
"reward": 2.89967794418335, |
|
"reward_std": 0.15721405297517776, |
|
"rewards/classifier_reward": 0.9496778607368469, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.88572387695314, |
|
"epoch": 0.305, |
|
"grad_norm": 104.91081313736798, |
|
"kl": 17.3171875, |
|
"learning_rate": 8.550511739408428e-07, |
|
"loss": 0.0182, |
|
"num_tokens": 3215345.0, |
|
"reward": 2.980521392822266, |
|
"reward_std": 0.05153606534004211, |
|
"rewards/classifier_reward": 0.9805211901664734, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 222.91430053710937, |
|
"epoch": 0.3075, |
|
"grad_norm": 7.509956960673518, |
|
"kl": 0.934375, |
|
"learning_rate": 8.520492350271895e-07, |
|
"loss": 0.001, |
|
"num_tokens": 3241067.0, |
|
"reward": 2.8307112216949464, |
|
"reward_std": 0.1734127746662125, |
|
"rewards/classifier_reward": 0.8521397054195404, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.17144165039062, |
|
"epoch": 0.31, |
|
"grad_norm": 6.959659369798759, |
|
"kl": 1.028125, |
|
"learning_rate": 8.490219157935588e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 3267393.0, |
|
"reward": 2.7303539276123048, |
|
"reward_std": 0.1922714289277792, |
|
"rewards/classifier_reward": 0.9517823934555054, |
|
"rewards/length_reward": 0.8000000029802322, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.5428680419922, |
|
"epoch": 0.3125, |
|
"grad_norm": 8.221210886450612, |
|
"kl": 1.91640625, |
|
"learning_rate": 8.459694344887731e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 3293186.0, |
|
"reward": 2.739912986755371, |
|
"reward_std": 0.36319895684719083, |
|
"rewards/classifier_reward": 0.9541985750198364, |
|
"rewards/length_reward": 0.8285714328289032, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 216.02858276367186, |
|
"epoch": 0.315, |
|
"grad_norm": 6.737006006501442, |
|
"kl": 1.015625, |
|
"learning_rate": 8.428920111756657e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 3318667.0, |
|
"reward": 2.7134992361068724, |
|
"reward_std": 0.2444542996585369, |
|
"rewards/classifier_reward": 0.8563561499118805, |
|
"rewards/length_reward": 0.8571428596973419, |
|
"rewards/slop_reward": 1.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 226.20001220703125, |
|
"epoch": 0.3175, |
|
"grad_norm": 4.36908357763648, |
|
"kl": 0.96640625, |
|
"learning_rate": 8.397898677152172e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 3344503.0, |
|
"reward": 2.9305933475494386, |
|
"reward_std": 0.12042829990386963, |
|
"rewards/classifier_reward": 0.9591646075248719, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 203.31429443359374, |
|
"epoch": 0.32, |
|
"grad_norm": 6.1156992776971855, |
|
"kl": 0.85703125, |
|
"learning_rate": 8.366632277505597e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 3369294.0, |
|
"reward": 2.9620502471923826, |
|
"reward_std": 0.07551092505455018, |
|
"rewards/classifier_reward": 0.983478581905365, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.0285858154297, |
|
"epoch": 0.3225, |
|
"grad_norm": 6.8222141729156975, |
|
"kl": 0.85859375, |
|
"learning_rate": 8.335123166908543e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 3394915.0, |
|
"reward": 2.7929779529571532, |
|
"reward_std": 0.28287690281867983, |
|
"rewards/classifier_reward": 0.9340491890907288, |
|
"rewards/length_reward": 0.8857142865657807, |
|
"rewards/slop_reward": 0.9732142806053161, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 225.62857971191406, |
|
"epoch": 0.325, |
|
"grad_norm": 4.536469246397557, |
|
"kl": 1.11015625, |
|
"learning_rate": 8.303373616950406e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 3420549.0, |
|
"reward": 2.9556642055511473, |
|
"reward_std": 0.07710518054664135, |
|
"rewards/classifier_reward": 0.9985211491584778, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.68572387695312, |
|
"epoch": 0.3275, |
|
"grad_norm": 2.9694599664159407, |
|
"kl": 0.9546875, |
|
"learning_rate": 8.271385916554604e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 3445788.0, |
|
"reward": 2.996094989776611, |
|
"reward_std": 0.010332237184047698, |
|
"rewards/classifier_reward": 0.9960947871208191, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 296.20001831054685, |
|
"epoch": 0.33, |
|
"grad_norm": 4.064839178436649, |
|
"kl": 1.1390625, |
|
"learning_rate": 8.23916237181355e-07, |
|
"loss": 0.0016, |
|
"num_tokens": 3474044.0, |
|
"reward": 2.4268852710723876, |
|
"reward_std": 0.15440489053726197, |
|
"rewards/classifier_reward": 0.9697423577308655, |
|
"rewards/length_reward": 0.4571428596973419, |
|
"rewards/slop_reward": 1.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 282.771435546875, |
|
"epoch": 0.3325, |
|
"grad_norm": 10.750678349276473, |
|
"kl": 0.82734375, |
|
"learning_rate": 8.206705305822412e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 3501861.0, |
|
"reward": 2.4831544876098635, |
|
"reward_std": 0.26082203090190886, |
|
"rewards/classifier_reward": 0.9831543445587159, |
|
"rewards/length_reward": 0.5428571492433548, |
|
"rewards/slop_reward": 0.9571428537368775, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.34286804199218, |
|
"epoch": 0.335, |
|
"grad_norm": 106.5245827902456, |
|
"kl": 88.11328125, |
|
"learning_rate": 8.174017058511628e-07, |
|
"loss": 0.0893, |
|
"num_tokens": 3528356.0, |
|
"reward": 2.8546416759490967, |
|
"reward_std": 0.2578580856323242, |
|
"rewards/classifier_reward": 0.9903557300567627, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 280.2857299804688, |
|
"epoch": 0.3375, |
|
"grad_norm": 5.075379696681929, |
|
"kl": 1.60859375, |
|
"learning_rate": 8.141099986478212e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 3555922.0, |
|
"reward": 2.635714387893677, |
|
"reward_std": 0.31418272852897644, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.6571428656578064, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 304.5714416503906, |
|
"epoch": 0.34, |
|
"grad_norm": 5.929984152853479, |
|
"kl": 1.1859375, |
|
"learning_rate": 8.107956462815861e-07, |
|
"loss": 0.0017, |
|
"num_tokens": 3584471.0, |
|
"reward": 2.3495986461639404, |
|
"reward_std": 0.3951677083969116, |
|
"rewards/classifier_reward": 0.8853127479553222, |
|
"rewards/length_reward": 0.4857142955064774, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 288.2857299804688, |
|
"epoch": 0.3425, |
|
"grad_norm": 8.1455480169376, |
|
"kl": 1.215625, |
|
"learning_rate": 8.074588876943872e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 3612481.0, |
|
"reward": 2.4660715579986574, |
|
"reward_std": 0.5484442114830017, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.5142857283353806, |
|
"rewards/slop_reward": 0.9517857074737549, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 267.8000061035156, |
|
"epoch": 0.345, |
|
"grad_norm": 5.930202214193798, |
|
"kl": 0.98125, |
|
"learning_rate": 8.040999634434882e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 3639774.0, |
|
"reward": 2.7785715579986574, |
|
"reward_std": 0.39145426750183104, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.800000011920929, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.57144165039062, |
|
"epoch": 0.3475, |
|
"grad_norm": 4.399939053025549, |
|
"kl": 1.53125, |
|
"learning_rate": 8.00719115684144e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 3666639.0, |
|
"reward": 2.6642858505249025, |
|
"reward_std": 0.21827136874198913, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.6857142925262452, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.571435546875, |
|
"epoch": 0.35, |
|
"grad_norm": 6.704733780428727, |
|
"kl": 1.0859375, |
|
"learning_rate": 7.973165881521433e-07, |
|
"loss": 0.002, |
|
"num_tokens": 3693159.0, |
|
"reward": 2.8857144355773925, |
|
"reward_std": 0.24877579212188722, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.9428680419922, |
|
"epoch": 0.3525, |
|
"grad_norm": 3.4261047442168397, |
|
"kl": 0.93359375, |
|
"learning_rate": 7.938926261462365e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 3719792.0, |
|
"reward": 2.742554450035095, |
|
"reward_std": 0.16382334232330323, |
|
"rewards/classifier_reward": 0.8568399548530579, |
|
"rewards/length_reward": 0.8857142865657807, |
|
"rewards/slop_reward": 1.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.3714385986328, |
|
"epoch": 0.355, |
|
"grad_norm": 6.044238574227886, |
|
"kl": 1.3890625, |
|
"learning_rate": 7.90447476510452e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 3745103.0, |
|
"reward": 2.9194665908813477, |
|
"reward_std": 0.21307192444801332, |
|
"rewards/classifier_reward": 0.9980378150939941, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 184.5428680419922, |
|
"epoch": 0.3575, |
|
"grad_norm": 4.2944694392303155, |
|
"kl": 1.140625, |
|
"learning_rate": 7.869813876162998e-07, |
|
"loss": 0.003, |
|
"num_tokens": 3769090.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 207.1428680419922, |
|
"epoch": 0.36, |
|
"grad_norm": 2.4809967074334627, |
|
"kl": 1.1765625, |
|
"learning_rate": 7.834946093448658e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 3794079.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 197.17143859863282, |
|
"epoch": 0.3625, |
|
"grad_norm": 3.755882982422092, |
|
"kl": 1.396875, |
|
"learning_rate": 7.799873930687977e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 3818773.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 187.71429443359375, |
|
"epoch": 0.365, |
|
"grad_norm": 6.482025225350517, |
|
"kl": 2.0046875, |
|
"learning_rate": 7.764599916341816e-07, |
|
"loss": 0.003, |
|
"num_tokens": 3843103.0, |
|
"reward": 2.892857313156128, |
|
"reward_std": 0.22987756729125977, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9142857193946838, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.8857208251953, |
|
"epoch": 0.3675, |
|
"grad_norm": 3.8265113738201806, |
|
"kl": 1.3890625, |
|
"learning_rate": 7.729126593423149e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 3868513.0, |
|
"reward": 2.997367763519287, |
|
"reward_std": 0.006964774429798126, |
|
"rewards/classifier_reward": 0.9973675608634949, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 184.1428649902344, |
|
"epoch": 0.37, |
|
"grad_norm": 18.75869388269992, |
|
"kl": 1.515625, |
|
"learning_rate": 7.693456519313719e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 3892878.0, |
|
"reward": 2.851199245452881, |
|
"reward_std": 0.18016420006752015, |
|
"rewards/classifier_reward": 0.9869133591651916, |
|
"rewards/length_reward": 0.8857142865657807, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 193.6571502685547, |
|
"epoch": 0.3725, |
|
"grad_norm": 5.454996617575025, |
|
"kl": 1.684375, |
|
"learning_rate": 7.657592265579669e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 3917511.0, |
|
"reward": 2.993258571624756, |
|
"reward_std": 0.01783668529242277, |
|
"rewards/classifier_reward": 0.9932583689689636, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 181.31429138183594, |
|
"epoch": 0.375, |
|
"grad_norm": 3.955698987070301, |
|
"kl": 1.3390625, |
|
"learning_rate": 7.621536417786158e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 3941554.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 183.9714385986328, |
|
"epoch": 0.3775, |
|
"grad_norm": 4.5979098970153185, |
|
"kl": 1.528125, |
|
"learning_rate": 7.585291575310952e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 3965818.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 175.71429443359375, |
|
"epoch": 0.38, |
|
"grad_norm": 7.501271856937242, |
|
"kl": 1.790625, |
|
"learning_rate": 7.548860351157027e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 3989746.0, |
|
"reward": 2.7536909580230713, |
|
"reward_std": 0.3092236161231995, |
|
"rewards/classifier_reward": 0.9251193881034852, |
|
"rewards/length_reward": 0.8285714387893677, |
|
"rewards/slop_reward": 1.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 210.571435546875, |
|
"epoch": 0.3825, |
|
"grad_norm": 5.2866175605, |
|
"kl": 2.065625, |
|
"learning_rate": 7.512245371764196e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 4015036.0, |
|
"reward": 2.9245490074157714, |
|
"reward_std": 0.12335940003395081, |
|
"rewards/classifier_reward": 0.9727631211280823, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9517857074737549, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 187.771435546875, |
|
"epoch": 0.385, |
|
"grad_norm": 4.602093297361531, |
|
"kl": 2.203125, |
|
"learning_rate": 7.475449276819752e-07, |
|
"loss": 0.0041, |
|
"num_tokens": 4039528.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 175.60000610351562, |
|
"epoch": 0.3875, |
|
"grad_norm": 6.76968523287945, |
|
"kl": 1.71875, |
|
"learning_rate": 7.438474719068173e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 4063594.0, |
|
"reward": 2.914285898208618, |
|
"reward_std": 0.1731828987598419, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9142857193946838, |
|
"rewards/slop_reward": 1.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 181.31429443359374, |
|
"epoch": 0.39, |
|
"grad_norm": 3.9415238872059795, |
|
"kl": 3.5171875, |
|
"learning_rate": 7.401324364119871e-07, |
|
"loss": 0.0054, |
|
"num_tokens": 4087555.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 199.22858276367188, |
|
"epoch": 0.3925, |
|
"grad_norm": 0.6743528375242909, |
|
"kl": 1.6703125, |
|
"learning_rate": 7.364000890259023e-07, |
|
"loss": 0.0041, |
|
"num_tokens": 4112265.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 203.02857971191406, |
|
"epoch": 0.395, |
|
"grad_norm": 0.21235184389038209, |
|
"kl": 1.5953125, |
|
"learning_rate": 7.326506988250487e-07, |
|
"loss": 0.004, |
|
"num_tokens": 4137291.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 211.00000915527343, |
|
"epoch": 0.3975, |
|
"grad_norm": 0.7871274406295002, |
|
"kl": 1.9828125, |
|
"learning_rate": 7.288845361145812e-07, |
|
"loss": 0.0044, |
|
"num_tokens": 4162596.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 205.82857971191407, |
|
"epoch": 0.4, |
|
"grad_norm": 0.3126714050105658, |
|
"kl": 1.596875, |
|
"learning_rate": 7.251018724088366e-07, |
|
"loss": 0.004, |
|
"num_tokens": 4187048.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 197.42857666015624, |
|
"epoch": 0.4025, |
|
"grad_norm": 0.3330658614195518, |
|
"kl": 1.5875, |
|
"learning_rate": 7.213029804117603e-07, |
|
"loss": 0.004, |
|
"num_tokens": 4211839.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 182.74286499023438, |
|
"epoch": 0.405, |
|
"grad_norm": 0.09053574072038517, |
|
"kl": 1.2125, |
|
"learning_rate": 7.174881339972448e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 4236155.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 189.971435546875, |
|
"epoch": 0.4075, |
|
"grad_norm": 3.922781639240598, |
|
"kl": 1.1859375, |
|
"learning_rate": 7.136576081893863e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 4260724.0, |
|
"reward": 2.968118953704834, |
|
"reward_std": 0.07463454008102417, |
|
"rewards/classifier_reward": 0.9966901540756226, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 211.22857971191405, |
|
"epoch": 0.41, |
|
"grad_norm": 0.5831119895182555, |
|
"kl": 1.22265625, |
|
"learning_rate": 7.09811679142657e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 4285850.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 203.71429748535155, |
|
"epoch": 0.4125, |
|
"grad_norm": 2.3433145018681825, |
|
"kl": 1.1546875, |
|
"learning_rate": 7.059506241219964e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 4310900.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 200.91429443359374, |
|
"epoch": 0.415, |
|
"grad_norm": 0.18814570159680577, |
|
"kl": 1.1671875, |
|
"learning_rate": 7.02074721482822e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 4335852.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.08572692871093, |
|
"epoch": 0.4175, |
|
"grad_norm": 3.9253886023797246, |
|
"kl": 1.08125, |
|
"learning_rate": 6.981842506509626e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 4361111.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.82857971191407, |
|
"epoch": 0.42, |
|
"grad_norm": 118.43288868650292, |
|
"kl": 36.20390625, |
|
"learning_rate": 6.942794921025126e-07, |
|
"loss": 0.0382, |
|
"num_tokens": 4386130.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 194.0571533203125, |
|
"epoch": 0.4225, |
|
"grad_norm": 0.07954465329541446, |
|
"kl": 1.03984375, |
|
"learning_rate": 6.903607273436127e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 4410840.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 193.7714385986328, |
|
"epoch": 0.425, |
|
"grad_norm": 0.0486408492542655, |
|
"kl": 0.978125, |
|
"learning_rate": 6.864282388901543e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 4435370.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 208.08572692871093, |
|
"epoch": 0.4275, |
|
"grad_norm": 2.6728463524577752, |
|
"kl": 0.92265625, |
|
"learning_rate": 6.824823102474126e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 4460308.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.22858276367188, |
|
"epoch": 0.43, |
|
"grad_norm": 0.13449796435482986, |
|
"kl": 1.034375, |
|
"learning_rate": 6.785232258896076e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 4485226.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 182.68572082519532, |
|
"epoch": 0.4325, |
|
"grad_norm": 7.189835012946043, |
|
"kl": 0.95078125, |
|
"learning_rate": 6.745512712393957e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 4509446.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 192.20000915527345, |
|
"epoch": 0.435, |
|
"grad_norm": 3.314581838568204, |
|
"kl": 0.9203125, |
|
"learning_rate": 6.705667326472924e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 4533638.0, |
|
"reward": 2.992787170410156, |
|
"reward_std": 0.019083873927593233, |
|
"rewards/classifier_reward": 0.992786979675293, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 181.7714416503906, |
|
"epoch": 0.4375, |
|
"grad_norm": 4.868845585454781, |
|
"kl": 0.91015625, |
|
"learning_rate": 6.665698973710288e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 4557920.0, |
|
"reward": 2.914285898208618, |
|
"reward_std": 0.1731828987598419, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9142857193946838, |
|
"rewards/slop_reward": 1.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.4857208251953, |
|
"epoch": 0.44, |
|
"grad_norm": 5.277192051669204, |
|
"kl": 0.9015625, |
|
"learning_rate": 6.625610535548417e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 4582927.0, |
|
"reward": 2.971197080612183, |
|
"reward_std": 0.07549313902854919, |
|
"rewards/classifier_reward": 0.9997682809829712, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 191.3714385986328, |
|
"epoch": 0.4425, |
|
"grad_norm": 3.5895293665237213, |
|
"kl": 0.86953125, |
|
"learning_rate": 6.58540490208701e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 4607545.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 204.74286804199218, |
|
"epoch": 0.445, |
|
"grad_norm": 3.6924099151163348, |
|
"kl": 0.88046875, |
|
"learning_rate": 6.545084971874736e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 4632631.0, |
|
"reward": 2.9700303077697754, |
|
"reward_std": 0.07929292395710945, |
|
"rewards/classifier_reward": 0.9986015200614929, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 210.74286804199218, |
|
"epoch": 0.4475, |
|
"grad_norm": 4.558365673779022, |
|
"kl": 1.03125, |
|
"learning_rate": 6.504653651700277e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 4657813.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 209.17143859863282, |
|
"epoch": 0.45, |
|
"grad_norm": 0.04420464498654459, |
|
"kl": 0.86640625, |
|
"learning_rate": 6.464113856382751e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 4683054.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.4857208251953, |
|
"epoch": 0.4525, |
|
"grad_norm": 2.557396683384517, |
|
"kl": 1.0640625, |
|
"learning_rate": 6.423468508561598e-07, |
|
"loss": 0.003, |
|
"num_tokens": 4708257.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.88572387695314, |
|
"epoch": 0.455, |
|
"grad_norm": 3.1120823972576854, |
|
"kl": 1.415625, |
|
"learning_rate": 6.382720538485855e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 4734223.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 226.40000915527344, |
|
"epoch": 0.4575, |
|
"grad_norm": 0.06880695435363977, |
|
"kl": 0.846875, |
|
"learning_rate": 6.341872883802922e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 4759812.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.05715637207032, |
|
"epoch": 0.46, |
|
"grad_norm": 2.2167569033565937, |
|
"kl": 0.75546875, |
|
"learning_rate": 6.300928489346765e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 4785935.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 227.91429443359374, |
|
"epoch": 0.4625, |
|
"grad_norm": 2.5442698264193897, |
|
"kl": 0.7765625, |
|
"learning_rate": 6.259890306925626e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 4811832.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 221.2571533203125, |
|
"epoch": 0.465, |
|
"grad_norm": 4.529163471508088, |
|
"kl": 0.79765625, |
|
"learning_rate": 6.218761295109208e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 4837398.0, |
|
"reward": 2.9214287281036375, |
|
"reward_std": 0.2078804552555084, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.40000915527344, |
|
"epoch": 0.4675, |
|
"grad_norm": 0.04502419697939018, |
|
"kl": 0.8265625, |
|
"learning_rate": 6.177544419015387e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 4862776.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.85715026855468, |
|
"epoch": 0.47, |
|
"grad_norm": 2.7870230542981322, |
|
"kl": 0.825, |
|
"learning_rate": 6.13624265009645e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 4888566.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.0571502685547, |
|
"epoch": 0.4725, |
|
"grad_norm": 0.06424356320573535, |
|
"kl": 0.8890625, |
|
"learning_rate": 6.094858965924866e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 4914374.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.4571533203125, |
|
"epoch": 0.475, |
|
"grad_norm": 4.417906183651794, |
|
"kl": 0.90703125, |
|
"learning_rate": 6.053396349978631e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 4940085.0, |
|
"reward": 2.8559008598327638, |
|
"reward_std": 0.20502071976661682, |
|
"rewards/classifier_reward": 0.9344720721244812, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.57143859863282, |
|
"epoch": 0.4775, |
|
"grad_norm": 0.7826472563807869, |
|
"kl": 0.95703125, |
|
"learning_rate": 6.011857791426178e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 4966005.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.171435546875, |
|
"epoch": 0.48, |
|
"grad_norm": 0.2991397618360992, |
|
"kl": 1.06015625, |
|
"learning_rate": 5.970246284910876e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 4991803.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.6285827636719, |
|
"epoch": 0.4825, |
|
"grad_norm": 4.590555458209977, |
|
"kl": 0.915625, |
|
"learning_rate": 5.92856483033514e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 5018320.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.00001220703126, |
|
"epoch": 0.485, |
|
"grad_norm": 3.8249513457346622, |
|
"kl": 0.83125, |
|
"learning_rate": 5.886816432644154e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 5044075.0, |
|
"reward": 2.9297013759613035, |
|
"reward_std": 0.12975128293037413, |
|
"rewards/classifier_reward": 0.9797011494636536, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 232.00000915527343, |
|
"epoch": 0.4875, |
|
"grad_norm": 3.4135057358827834, |
|
"kl": 1.3140625, |
|
"learning_rate": 5.845004101609246e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 5069796.0, |
|
"reward": 2.9581347465515138, |
|
"reward_std": 0.10335763692855834, |
|
"rewards/classifier_reward": 0.9867059469223023, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 236.68572692871095, |
|
"epoch": 0.49, |
|
"grad_norm": 4.013642506936478, |
|
"kl": 0.85390625, |
|
"learning_rate": 5.803130851610885e-07, |
|
"loss": 0.0023, |
|
"num_tokens": 5095958.0, |
|
"reward": 2.9428573131561278, |
|
"reward_std": 0.15118578672409058, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.4571533203125, |
|
"epoch": 0.4925, |
|
"grad_norm": 4.603644905524406, |
|
"kl": 0.940625, |
|
"learning_rate": 5.761199701421391e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 5121931.0, |
|
"reward": 2.941946840286255, |
|
"reward_std": 0.15359463561326264, |
|
"rewards/classifier_reward": 0.9990895390510559, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.02857971191406, |
|
"epoch": 0.495, |
|
"grad_norm": 2.4430881235820507, |
|
"kl": 0.88046875, |
|
"learning_rate": 5.719213673987276e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 5148140.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.7714385986328, |
|
"epoch": 0.4975, |
|
"grad_norm": 2.885447633946335, |
|
"kl": 0.8984375, |
|
"learning_rate": 5.677175796211332e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 5173797.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.77144470214844, |
|
"epoch": 0.5, |
|
"grad_norm": 0.04186680424978939, |
|
"kl": 0.9296875, |
|
"learning_rate": 5.635089098734393e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5199798.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.51429443359376, |
|
"epoch": 0.5025, |
|
"grad_norm": 2.794225657350946, |
|
"kl": 0.95, |
|
"learning_rate": 5.592956615716866e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 5225576.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 216.02857971191406, |
|
"epoch": 0.505, |
|
"grad_norm": 4.294508436959706, |
|
"kl": 1.10625, |
|
"learning_rate": 5.550781384619973e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 5251038.0, |
|
"reward": 2.975967788696289, |
|
"reward_std": 0.0635837346315384, |
|
"rewards/classifier_reward": 0.9759676098823548, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.571435546875, |
|
"epoch": 0.5075, |
|
"grad_norm": 0.043507163632059365, |
|
"kl": 0.91484375, |
|
"learning_rate": 5.50856644598678e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5276396.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 205.6571502685547, |
|
"epoch": 0.51, |
|
"grad_norm": 0.09961196396275367, |
|
"kl": 1.0578125, |
|
"learning_rate": 5.466314843222993e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 5301460.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 225.9714385986328, |
|
"epoch": 0.5125, |
|
"grad_norm": 3.605683444813084, |
|
"kl": 0.9609375, |
|
"learning_rate": 5.424029622377546e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 5327289.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 215.80001220703124, |
|
"epoch": 0.515, |
|
"grad_norm": 0.19828091837058953, |
|
"kl": 1.23828125, |
|
"learning_rate": 5.381713831923007e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 5352596.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.8571563720703, |
|
"epoch": 0.5175, |
|
"grad_norm": 2.9547607246974565, |
|
"kl": 1.12890625, |
|
"learning_rate": 5.339370522535804e-07, |
|
"loss": 0.003, |
|
"num_tokens": 5377938.0, |
|
"reward": 2.992868709564209, |
|
"reward_std": 0.018868234753608704, |
|
"rewards/classifier_reward": 0.9928684830665588, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 252.00001525878906, |
|
"epoch": 0.52, |
|
"grad_norm": 871.3268746641542, |
|
"kl": 64.790625, |
|
"learning_rate": 5.297002746876284e-07, |
|
"loss": 0.0667, |
|
"num_tokens": 5404678.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 271.0857269287109, |
|
"epoch": 0.5225, |
|
"grad_norm": 13961.922953195786, |
|
"kl": 6094.1890625, |
|
"learning_rate": 5.254613559368648e-07, |
|
"loss": 6.1111, |
|
"num_tokens": 5432086.0, |
|
"reward": 2.8857144832611086, |
|
"reward_std": 0.18249738812446595, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 257.9714385986328, |
|
"epoch": 0.525, |
|
"grad_norm": 8.061516897407943, |
|
"kl": 1.028125, |
|
"learning_rate": 5.212206015980741e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 5459016.0, |
|
"reward": 2.828571605682373, |
|
"reward_std": 0.1731828987598419, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.3714385986328, |
|
"epoch": 0.5275, |
|
"grad_norm": 0.05294394256827684, |
|
"kl": 1.0125, |
|
"learning_rate": 5.169783174003744e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 5484886.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.20001220703125, |
|
"epoch": 0.53, |
|
"grad_norm": 2.7195604219419645, |
|
"kl": 1.353125, |
|
"learning_rate": 5.127348091831755e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5511353.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.80001220703124, |
|
"epoch": 0.5325, |
|
"grad_norm": 2.435046990313359, |
|
"kl": 0.98671875, |
|
"learning_rate": 5.084903828741312e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 5537879.0, |
|
"reward": 2.828571653366089, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8285714298486709, |
|
"rewards/slop_reward": 1.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 233.82858276367188, |
|
"epoch": 0.535, |
|
"grad_norm": 2.848539339582319, |
|
"kl": 1.053125, |
|
"learning_rate": 5.042453444670828e-07, |
|
"loss": 0.003, |
|
"num_tokens": 5563937.0, |
|
"reward": 2.9500002384185793, |
|
"reward_std": 0.0866025447845459, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.6285827636719, |
|
"epoch": 0.5375, |
|
"grad_norm": 4.574390608580604, |
|
"kl": 0.91953125, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 5590384.0, |
|
"reward": 2.8857144832611086, |
|
"reward_std": 0.19518001079559327, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 284.0571563720703, |
|
"epoch": 0.54, |
|
"grad_norm": 5.393964971542159, |
|
"kl": 1.16484375, |
|
"learning_rate": 4.957546555329173e-07, |
|
"loss": 0.0016, |
|
"num_tokens": 5618238.0, |
|
"reward": 2.6285715103149414, |
|
"reward_std": 0.25809029340744016, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.6285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.40001525878907, |
|
"epoch": 0.5425, |
|
"grad_norm": 3.809477712898502, |
|
"kl": 0.7828125, |
|
"learning_rate": 4.915096171258689e-07, |
|
"loss": 0.0022, |
|
"num_tokens": 5644712.0, |
|
"reward": 2.828571605682373, |
|
"reward_std": 0.17318291068077088, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 227.60001220703126, |
|
"epoch": 0.545, |
|
"grad_norm": 0.7163836490261896, |
|
"kl": 1.0765625, |
|
"learning_rate": 4.872651908168244e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 5670466.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 210.7714416503906, |
|
"epoch": 0.5475, |
|
"grad_norm": 0.06375443345853048, |
|
"kl": 0.86953125, |
|
"learning_rate": 4.830216825996256e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5695540.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.02858276367186, |
|
"epoch": 0.55, |
|
"grad_norm": 0.044596037608717574, |
|
"kl": 0.8765625, |
|
"learning_rate": 4.787793984019259e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5720881.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 200.80000915527344, |
|
"epoch": 0.5525, |
|
"grad_norm": 0.16974934473523362, |
|
"kl": 1.16640625, |
|
"learning_rate": 4.7453864406313536e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 5745792.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 211.22857971191405, |
|
"epoch": 0.555, |
|
"grad_norm": 0.0818904708032968, |
|
"kl": 0.9921875, |
|
"learning_rate": 4.7029972531237154e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 5770873.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 195.2571533203125, |
|
"epoch": 0.5575, |
|
"grad_norm": 0.05116820241495938, |
|
"kl": 0.9, |
|
"learning_rate": 4.6606294774641965e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5795571.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 190.57143859863282, |
|
"epoch": 0.56, |
|
"grad_norm": 3.069339312788395, |
|
"kl": 0.9046875, |
|
"learning_rate": 4.6182861680769923e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 5819962.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 195.08572387695312, |
|
"epoch": 0.5625, |
|
"grad_norm": 2.051249437321434, |
|
"kl": 0.890625, |
|
"learning_rate": 4.5759703776224555e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 5844710.0, |
|
"reward": 2.954781198501587, |
|
"reward_std": 0.0808977723121643, |
|
"rewards/classifier_reward": 0.9833523750305175, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 194.11429748535156, |
|
"epoch": 0.565, |
|
"grad_norm": 0.06947745807380493, |
|
"kl": 0.915625, |
|
"learning_rate": 4.5336851567770074e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5869322.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 196.68572082519532, |
|
"epoch": 0.5675, |
|
"grad_norm": 3.6616246314488534, |
|
"kl": 0.75234375, |
|
"learning_rate": 4.4914335540132204e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 5893903.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 190.91429443359374, |
|
"epoch": 0.57, |
|
"grad_norm": 2.9722209992367854, |
|
"kl": 2.09140625, |
|
"learning_rate": 4.4492186153800284e-07, |
|
"loss": 0.004, |
|
"num_tokens": 5918505.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 184.91429138183594, |
|
"epoch": 0.5725, |
|
"grad_norm": 0.06042948982632837, |
|
"kl": 0.8359375, |
|
"learning_rate": 4.407043384283136e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 5942897.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 182.25715026855468, |
|
"epoch": 0.575, |
|
"grad_norm": 0.07756768631242807, |
|
"kl": 0.8734375, |
|
"learning_rate": 4.364910901265606e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 5967196.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 173.02857971191406, |
|
"epoch": 0.5775, |
|
"grad_norm": 0.9892808129257086, |
|
"kl": 1.546875, |
|
"learning_rate": 4.3228242037886687e-07, |
|
"loss": 0.0039, |
|
"num_tokens": 5990830.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 162.94286499023437, |
|
"epoch": 0.58, |
|
"grad_norm": 0.0578879268472676, |
|
"kl": 0.97578125, |
|
"learning_rate": 4.280786326012723e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6014434.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 165.42857666015624, |
|
"epoch": 0.5825, |
|
"grad_norm": 3.3348841449857387, |
|
"kl": 1.1046875, |
|
"learning_rate": 4.23880029857861e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6038089.0, |
|
"reward": 2.914285945892334, |
|
"reward_std": 0.10690449476242066, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9142857193946838, |
|
"rewards/slop_reward": 1.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 163.71429138183595, |
|
"epoch": 0.585, |
|
"grad_norm": 3.485368879973419, |
|
"kl": 1.05703125, |
|
"learning_rate": 4.1968691483891133e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6061739.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 163.00000915527343, |
|
"epoch": 0.5875, |
|
"grad_norm": 5.36230481850721, |
|
"kl": 1.00703125, |
|
"learning_rate": 4.154995898390755e-07, |
|
"loss": 0.002, |
|
"num_tokens": 6085364.0, |
|
"reward": 2.9397803783416747, |
|
"reward_std": 0.15932661443948745, |
|
"rewards/classifier_reward": 0.9969230651855469, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 177.0571502685547, |
|
"epoch": 0.59, |
|
"grad_norm": 3.3342971599613382, |
|
"kl": 1.04140625, |
|
"learning_rate": 4.1131835673558456e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6109257.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 191.42857971191407, |
|
"epoch": 0.5925, |
|
"grad_norm": 0.08376084072008337, |
|
"kl": 1.02109375, |
|
"learning_rate": 4.0714351696648606e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6133846.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 192.08572692871093, |
|
"epoch": 0.595, |
|
"grad_norm": 0.046627278932137715, |
|
"kl": 0.98515625, |
|
"learning_rate": 4.029753715089123e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6158489.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 198.08572387695312, |
|
"epoch": 0.5975, |
|
"grad_norm": 11.565806887201926, |
|
"kl": 13.7140625, |
|
"learning_rate": 3.988142208573822e-07, |
|
"loss": 0.0161, |
|
"num_tokens": 6183159.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 197.31429443359374, |
|
"epoch": 0.6, |
|
"grad_norm": 0.05311856298193549, |
|
"kl": 1.02265625, |
|
"learning_rate": 3.94660365002137e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6207985.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.25715026855468, |
|
"epoch": 0.6025, |
|
"grad_norm": 4.782436637657736, |
|
"kl": 1.04921875, |
|
"learning_rate": 3.9051410340751346e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 6232984.0, |
|
"reward": 2.8857144832611086, |
|
"reward_std": 0.1824974000453949, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 193.6285827636719, |
|
"epoch": 0.605, |
|
"grad_norm": 3.232063907518827, |
|
"kl": 1.15859375, |
|
"learning_rate": 3.8637573499035503e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 6257629.0, |
|
"reward": 2.879447841644287, |
|
"reward_std": 0.11382801532745361, |
|
"rewards/classifier_reward": 0.9937332987785339, |
|
"rewards/length_reward": 0.8857142865657807, |
|
"rewards/slop_reward": 1.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 195.4571502685547, |
|
"epoch": 0.6075, |
|
"grad_norm": 0.06471530202926354, |
|
"kl": 1.05625, |
|
"learning_rate": 3.822455580984613e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6282207.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 274.0571594238281, |
|
"epoch": 0.61, |
|
"grad_norm": 3.0795183504620107, |
|
"kl": 1.50390625, |
|
"learning_rate": 3.781238704890792e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6309000.0, |
|
"reward": 2.783582401275635, |
|
"reward_std": 0.023816290497779845, |
|
"rewards/classifier_reward": 0.9835822105407714, |
|
"rewards/length_reward": 0.8, |
|
"rewards/slop_reward": 1.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 210.28572692871094, |
|
"epoch": 0.6125, |
|
"grad_norm": 3.0694884073186777, |
|
"kl": 1.0015625, |
|
"learning_rate": 3.7401096930743746e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 6334093.0, |
|
"reward": 2.9978450298309327, |
|
"reward_std": 0.005702095478773117, |
|
"rewards/classifier_reward": 0.9978448033332825, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.02857971191406, |
|
"epoch": 0.615, |
|
"grad_norm": 2.5150561307413764, |
|
"kl": 1.084375, |
|
"learning_rate": 3.699071510653235e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6359434.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.0571533203125, |
|
"epoch": 0.6175, |
|
"grad_norm": 0.04173992686860701, |
|
"kl": 0.96875, |
|
"learning_rate": 3.6581271161970784e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6384975.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 221.91429443359374, |
|
"epoch": 0.62, |
|
"grad_norm": 0.11609659945057246, |
|
"kl": 1.05078125, |
|
"learning_rate": 3.6172794615141446e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6410642.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.8571533203125, |
|
"epoch": 0.6225, |
|
"grad_norm": 0.08717380831309861, |
|
"kl": 0.971875, |
|
"learning_rate": 3.5765314914384024e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6436607.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.22858276367188, |
|
"epoch": 0.625, |
|
"grad_norm": 0.06146613518625495, |
|
"kl": 0.99765625, |
|
"learning_rate": 3.535886143617248e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6462760.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.17144165039062, |
|
"epoch": 0.6275, |
|
"grad_norm": 3.705436703551293, |
|
"kl": 1.01953125, |
|
"learning_rate": 3.495346348299724e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 6488563.0, |
|
"reward": 2.9139774799346925, |
|
"reward_std": 0.1073996058665216, |
|
"rewards/classifier_reward": 0.9996915578842163, |
|
"rewards/length_reward": 0.9142857193946838, |
|
"rewards/slop_reward": 1.0, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.9714385986328, |
|
"epoch": 0.63, |
|
"grad_norm": 0.04691998567992384, |
|
"kl": 0.97890625, |
|
"learning_rate": 3.454915028125263e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6514734.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.02857971191406, |
|
"epoch": 0.6325, |
|
"grad_norm": 2.988060627858761, |
|
"kl": 1.0296875, |
|
"learning_rate": 3.4145950979129914e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 6540498.0, |
|
"reward": 2.99902081489563, |
|
"reward_std": 0.0025911811739206315, |
|
"rewards/classifier_reward": 0.9990206360816956, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.54286499023436, |
|
"epoch": 0.635, |
|
"grad_norm": 0.09098624238326748, |
|
"kl": 1.11640625, |
|
"learning_rate": 3.3743894644515824e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 6565616.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 223.57143859863282, |
|
"epoch": 0.6375, |
|
"grad_norm": 0.07923298438659351, |
|
"kl": 0.95703125, |
|
"learning_rate": 3.334301026289712e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 6591361.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 219.00000610351563, |
|
"epoch": 0.64, |
|
"grad_norm": 3.44595488232409, |
|
"kl": 1.071875, |
|
"learning_rate": 3.294332673527076e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6616850.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.80000915527344, |
|
"epoch": 0.6425, |
|
"grad_norm": 0.0704184343563711, |
|
"kl": 1.046875, |
|
"learning_rate": 3.254487287606044e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6642498.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.11429748535156, |
|
"epoch": 0.645, |
|
"grad_norm": 4.4341498316331425, |
|
"kl": 0.9890625, |
|
"learning_rate": 3.214767741103923e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 6668511.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 227.11429443359376, |
|
"epoch": 0.6475, |
|
"grad_norm": 2.667970216501767, |
|
"kl": 1.121875, |
|
"learning_rate": 3.1751768975258743e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6694380.0, |
|
"reward": 2.9997310638427734, |
|
"reward_std": 0.0007120789494365453, |
|
"rewards/classifier_reward": 0.9997308611869812, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.42858276367187, |
|
"epoch": 0.65, |
|
"grad_norm": 0.049266434577523735, |
|
"kl": 1.04921875, |
|
"learning_rate": 3.135717611098457e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6719910.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.97144165039063, |
|
"epoch": 0.6525, |
|
"grad_norm": 0.046460428834944396, |
|
"kl": 1.00546875, |
|
"learning_rate": 3.0963927265638734e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6745328.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.2571533203125, |
|
"epoch": 0.655, |
|
"grad_norm": 2.6813194889359324, |
|
"kl": 1.08125, |
|
"learning_rate": 3.0572050789748726e-07, |
|
"loss": 0.003, |
|
"num_tokens": 6771231.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 217.20001220703125, |
|
"epoch": 0.6575, |
|
"grad_norm": 2.8890479947848235, |
|
"kl": 1.16796875, |
|
"learning_rate": 3.018157493490374e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 6796753.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 233.51430053710936, |
|
"epoch": 0.66, |
|
"grad_norm": 2.802128759307218, |
|
"kl": 1.371875, |
|
"learning_rate": 2.9792527851717803e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 6822476.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 225.82858581542968, |
|
"epoch": 0.6625, |
|
"grad_norm": 0.06018626969663984, |
|
"kl": 1.04296875, |
|
"learning_rate": 2.940493758780037e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 6847958.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.571435546875, |
|
"epoch": 0.665, |
|
"grad_norm": 57.99972496539001, |
|
"kl": 2.54609375, |
|
"learning_rate": 2.9018832085734295e-07, |
|
"loss": 0.0045, |
|
"num_tokens": 6873054.0, |
|
"reward": 2.8571430683135985, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8571428596973419, |
|
"rewards/slop_reward": 1.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 222.40001220703124, |
|
"epoch": 0.6675, |
|
"grad_norm": 5.391908835054465, |
|
"kl": 1.19921875, |
|
"learning_rate": 2.863423918106138e-07, |
|
"loss": 0.0022, |
|
"num_tokens": 6898757.0, |
|
"reward": 2.9923308849334718, |
|
"reward_std": 0.02029096046462655, |
|
"rewards/classifier_reward": 0.9923307299613953, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 226.80001220703124, |
|
"epoch": 0.67, |
|
"grad_norm": 0.44063981643559547, |
|
"kl": 1.27265625, |
|
"learning_rate": 2.825118660027553e-07, |
|
"loss": 0.0037, |
|
"num_tokens": 6924550.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.80001220703124, |
|
"epoch": 0.6725, |
|
"grad_norm": 3.710195212366325, |
|
"kl": 1.1890625, |
|
"learning_rate": 2.786970195882398e-07, |
|
"loss": 0.0026, |
|
"num_tokens": 6950478.0, |
|
"reward": 2.9779660224914553, |
|
"reward_std": 0.058296956680715085, |
|
"rewards/classifier_reward": 0.9993943929672241, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 227.51430053710936, |
|
"epoch": 0.675, |
|
"grad_norm": 99.1686641289829, |
|
"kl": 79.6328125, |
|
"learning_rate": 2.748981275911633e-07, |
|
"loss": 0.0819, |
|
"num_tokens": 6976266.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.82857971191407, |
|
"epoch": 0.6775, |
|
"grad_norm": 4.778004765539873, |
|
"kl": 1.34140625, |
|
"learning_rate": 2.7111546388541896e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 7002514.0, |
|
"reward": 2.9389439105987547, |
|
"reward_std": 0.10570754185318947, |
|
"rewards/classifier_reward": 0.996086585521698, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.74286499023438, |
|
"epoch": 0.68, |
|
"grad_norm": 6.749318308021253, |
|
"kl": 1.54609375, |
|
"learning_rate": 2.673493011749513e-07, |
|
"loss": 0.003, |
|
"num_tokens": 7029221.0, |
|
"reward": 2.6523685693740844, |
|
"reward_std": 0.242851722240448, |
|
"rewards/classifier_reward": 0.9095112562179566, |
|
"rewards/length_reward": 0.7428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.02857971191406, |
|
"epoch": 0.6825, |
|
"grad_norm": 4.816456021644019, |
|
"kl": 1.69375, |
|
"learning_rate": 2.635999109740976e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 7055228.0, |
|
"reward": 2.858464765548706, |
|
"reward_std": 0.17900042831897736, |
|
"rewards/classifier_reward": 0.9941788673400879, |
|
"rewards/length_reward": 0.8857142865657807, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.7714385986328, |
|
"epoch": 0.685, |
|
"grad_norm": 2.4842492940641647, |
|
"kl": 1.0171875, |
|
"learning_rate": 2.598675635880129e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 7081610.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.571435546875, |
|
"epoch": 0.6875, |
|
"grad_norm": 2.556011928350466, |
|
"kl": 1.15546875, |
|
"learning_rate": 2.561525280931828e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 7107408.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.6285827636719, |
|
"epoch": 0.69, |
|
"grad_norm": 0.08355624448786589, |
|
"kl": 1.053125, |
|
"learning_rate": 2.5245507231802486e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7133271.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.5428680419922, |
|
"epoch": 0.6925, |
|
"grad_norm": 3.2025322276348107, |
|
"kl": 1.4359375, |
|
"learning_rate": 2.487754628235805e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 7159353.0, |
|
"reward": 2.9962107658386232, |
|
"reward_std": 0.01002594456076622, |
|
"rewards/classifier_reward": 0.9962105512619018, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.65715942382812, |
|
"epoch": 0.695, |
|
"grad_norm": 0.18581262102933002, |
|
"kl": 1.25859375, |
|
"learning_rate": 2.4511396488429724e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 7185072.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.51429748535156, |
|
"epoch": 0.6975, |
|
"grad_norm": 4.453521381818444, |
|
"kl": 0.99765625, |
|
"learning_rate": 2.414708424689048e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 7210683.0, |
|
"reward": 2.882569408416748, |
|
"reward_std": 0.2035010576248169, |
|
"rewards/classifier_reward": 0.9397120952606202, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 223.31429443359374, |
|
"epoch": 0.7, |
|
"grad_norm": 0.1121570381593698, |
|
"kl": 1.2296875, |
|
"learning_rate": 2.378463582213842e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 7236399.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 217.60000915527343, |
|
"epoch": 0.7025, |
|
"grad_norm": 4.495034027794915, |
|
"kl": 0.9515625, |
|
"learning_rate": 2.3424077344203307e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 7261935.0, |
|
"reward": 2.99290018081665, |
|
"reward_std": 0.018784815073013307, |
|
"rewards/classifier_reward": 0.9929000020027161, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.11429748535156, |
|
"epoch": 0.705, |
|
"grad_norm": 0.5186142582429601, |
|
"kl": 1.41875, |
|
"learning_rate": 2.3065434806862805e-07, |
|
"loss": 0.0038, |
|
"num_tokens": 7287768.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 228.11429443359376, |
|
"epoch": 0.7075, |
|
"grad_norm": 2.654334342756894, |
|
"kl": 1.0203125, |
|
"learning_rate": 2.2708734065768486e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 7312659.0, |
|
"reward": 2.9993388175964357, |
|
"reward_std": 0.0017499331384897231, |
|
"rewards/classifier_reward": 0.9993385910987854, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.40001220703124, |
|
"epoch": 0.71, |
|
"grad_norm": 4.588949960274673, |
|
"kl": 1.12421875, |
|
"learning_rate": 2.2354000836581831e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 7338617.0, |
|
"reward": 2.9408255100250242, |
|
"reward_std": 0.1565615115687251, |
|
"rewards/classifier_reward": 0.9908253073692321, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.6285827636719, |
|
"epoch": 0.7125, |
|
"grad_norm": 0.21793328593853228, |
|
"kl": 1.21484375, |
|
"learning_rate": 2.2001260693120232e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 7364198.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.08572998046876, |
|
"epoch": 0.715, |
|
"grad_norm": 2.616590063333536, |
|
"kl": 1.01640625, |
|
"learning_rate": 2.1650539065513412e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 7390479.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.97144165039063, |
|
"epoch": 0.7175, |
|
"grad_norm": 4.531348675554103, |
|
"kl": 1.19375, |
|
"learning_rate": 2.1301861238370016e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 7416732.0, |
|
"reward": 2.942857360839844, |
|
"reward_std": 0.09759000539779664, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9428571462631226, |
|
"rewards/slop_reward": 1.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.20001220703125, |
|
"epoch": 0.72, |
|
"grad_norm": 4.718898706696491, |
|
"kl": 1.49921875, |
|
"learning_rate": 2.0955252348954805e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7443268.0, |
|
"reward": 2.6439733505249023, |
|
"reward_std": 0.043535226583480836, |
|
"rewards/classifier_reward": 0.8439731419086456, |
|
"rewards/length_reward": 0.8, |
|
"rewards/slop_reward": 1.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 224.48572387695313, |
|
"epoch": 0.7225, |
|
"grad_norm": 0.04294906761504897, |
|
"kl": 1.00546875, |
|
"learning_rate": 2.0610737385376348e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7468990.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.77144470214844, |
|
"epoch": 0.725, |
|
"grad_norm": 0.13090449233770926, |
|
"kl": 1.19453125, |
|
"learning_rate": 2.026834118478567e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 7495162.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 222.57143859863282, |
|
"epoch": 0.7275, |
|
"grad_norm": 0.04429936575325668, |
|
"kl": 0.97734375, |
|
"learning_rate": 1.9928088431585589e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7520868.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.7428741455078, |
|
"epoch": 0.73, |
|
"grad_norm": 12.126930606187928, |
|
"kl": 10.5515625, |
|
"learning_rate": 1.959000365565119e-07, |
|
"loss": 0.0129, |
|
"num_tokens": 7546905.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 231.20001220703125, |
|
"epoch": 0.7325, |
|
"grad_norm": 1.1227825049824671, |
|
"kl": 1.8796875, |
|
"learning_rate": 1.925411123056128e-07, |
|
"loss": 0.0043, |
|
"num_tokens": 7572334.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.68572387695312, |
|
"epoch": 0.735, |
|
"grad_norm": 2.907894335014808, |
|
"kl": 1.3328125, |
|
"learning_rate": 1.8920435371841392e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 7598444.0, |
|
"reward": 2.996587371826172, |
|
"reward_std": 0.009029625356197358, |
|
"rewards/classifier_reward": 0.9965871214866638, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 232.68572387695312, |
|
"epoch": 0.7375, |
|
"grad_norm": 0.10333954670768886, |
|
"kl": 1.1203125, |
|
"learning_rate": 1.858900013521788e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 7624449.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 233.54286499023436, |
|
"epoch": 0.74, |
|
"grad_norm": 0.058548069484880644, |
|
"kl": 1.05859375, |
|
"learning_rate": 1.8259829414883725e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7650523.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.9428680419922, |
|
"epoch": 0.7425, |
|
"grad_norm": 3.1888591778113993, |
|
"kl": 1.009375, |
|
"learning_rate": 1.7932946941775878e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 7676533.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.5428680419922, |
|
"epoch": 0.745, |
|
"grad_norm": 0.10062491596572344, |
|
"kl": 1.0046875, |
|
"learning_rate": 1.7608376281864502e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7702619.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.51430053710936, |
|
"epoch": 0.7475, |
|
"grad_norm": 0.061386216937859915, |
|
"kl": 1.02578125, |
|
"learning_rate": 1.7286140834453954e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7729097.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 254.4571533203125, |
|
"epoch": 0.75, |
|
"grad_norm": 0.05751560507113229, |
|
"kl": 1.03984375, |
|
"learning_rate": 1.6966263830495935e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7755641.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.34287109375, |
|
"epoch": 0.7525, |
|
"grad_norm": 0.11118968278237154, |
|
"kl": 1.11484375, |
|
"learning_rate": 1.6648768330914576e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 7781895.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.40000915527344, |
|
"epoch": 0.755, |
|
"grad_norm": 0.191477055644317, |
|
"kl": 1.0296875, |
|
"learning_rate": 1.6333677224944037e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 7808096.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 256.8571533203125, |
|
"epoch": 0.7575, |
|
"grad_norm": 2.9503904549206177, |
|
"kl": 0.984375, |
|
"learning_rate": 1.6021013228478275e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 7835006.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.02857971191406, |
|
"epoch": 0.76, |
|
"grad_norm": 7.0345652170290975, |
|
"kl": 1.221875, |
|
"learning_rate": 1.5710798882433428e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 7861536.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 253.68572387695312, |
|
"epoch": 0.7625, |
|
"grad_norm": 0.0476505587038927, |
|
"kl": 0.96328125, |
|
"learning_rate": 1.5403056551122694e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 7888255.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.88572387695314, |
|
"epoch": 0.765, |
|
"grad_norm": 0.04204135010045963, |
|
"kl": 0.89609375, |
|
"learning_rate": 1.5097808420644115e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 7914639.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.1428649902344, |
|
"epoch": 0.7675, |
|
"grad_norm": 0.07577358460759685, |
|
"kl": 0.9578125, |
|
"learning_rate": 1.479507649728105e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 7941362.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.9714385986328, |
|
"epoch": 0.77, |
|
"grad_norm": 0.04375804621751843, |
|
"kl": 0.95703125, |
|
"learning_rate": 1.4494882605915714e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 7967870.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 262.4571533203125, |
|
"epoch": 0.7725, |
|
"grad_norm": 3.881559830490846, |
|
"kl": 1.06953125, |
|
"learning_rate": 1.419724838845569e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 7994976.0, |
|
"reward": 2.7243717670440675, |
|
"reward_std": 0.2547113478183746, |
|
"rewards/classifier_reward": 0.9529429793357849, |
|
"rewards/length_reward": 0.7714285761117935, |
|
"rewards/slop_reward": 1.0, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 256.0857208251953, |
|
"epoch": 0.775, |
|
"grad_norm": 0.042854049785843215, |
|
"kl": 0.934375, |
|
"learning_rate": 1.3902195302273778e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 8021851.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 270.20001525878905, |
|
"epoch": 0.7775, |
|
"grad_norm": 5.120776391757703, |
|
"kl": 0.9703125, |
|
"learning_rate": 1.3609744618661013e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 8049101.0, |
|
"reward": 2.8000001430511476, |
|
"reward_std": 0.28008740544319155, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8000000059604645, |
|
"rewards/slop_reward": 1.0, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 266.88572998046874, |
|
"epoch": 0.78, |
|
"grad_norm": 5.825629703539792, |
|
"kl": 1.01640625, |
|
"learning_rate": 1.331991742129318e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 8076201.0, |
|
"reward": 2.828571605682373, |
|
"reward_std": 0.1731828987598419, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8285714328289032, |
|
"rewards/slop_reward": 1.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.80001220703124, |
|
"epoch": 0.7825, |
|
"grad_norm": 2.2750579624967204, |
|
"kl": 0.90625, |
|
"learning_rate": 1.3032734604710783e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 8102845.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 258.34287109375, |
|
"epoch": 0.785, |
|
"grad_norm": 2.9657377944921426, |
|
"kl": 1.23046875, |
|
"learning_rate": 1.2748216872812745e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 8129806.0, |
|
"reward": 2.828571653366089, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8285714298486709, |
|
"rewards/slop_reward": 1.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.60000915527343, |
|
"epoch": 0.7875, |
|
"grad_norm": 0.04358713190283898, |
|
"kl": 0.9265625, |
|
"learning_rate": 1.2466384737363779e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 8156161.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 252.94286499023437, |
|
"epoch": 0.79, |
|
"grad_norm": 0.05992737661065536, |
|
"kl": 0.98046875, |
|
"learning_rate": 1.2187258516515642e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 8182699.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.91429443359374, |
|
"epoch": 0.7925, |
|
"grad_norm": 0.07122711325744925, |
|
"kl": 0.9984375, |
|
"learning_rate": 1.1910858333342277e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 8209296.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.11429748535156, |
|
"epoch": 0.795, |
|
"grad_norm": 3.4982043756301584, |
|
"kl": 0.98671875, |
|
"learning_rate": 1.1637204114389177e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 8235818.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.08572387695312, |
|
"epoch": 0.7975, |
|
"grad_norm": 0.04511891765892185, |
|
"kl": 0.93671875, |
|
"learning_rate": 1.1366315588236741e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 8262480.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 252.4571563720703, |
|
"epoch": 0.8, |
|
"grad_norm": 4.101920254520037, |
|
"kl": 0.98984375, |
|
"learning_rate": 1.1098212284078035e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 8289236.0, |
|
"reward": 2.8857144832611086, |
|
"reward_std": 0.19518001079559327, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.8857142925262451, |
|
"rewards/slop_reward": 1.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.91429748535157, |
|
"epoch": 0.8025, |
|
"grad_norm": 0.05706184089424821, |
|
"kl": 0.975, |
|
"learning_rate": 1.0832913530310783e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 8315716.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 250.31430358886718, |
|
"epoch": 0.805, |
|
"grad_norm": 0.07917877336969092, |
|
"kl": 1.02265625, |
|
"learning_rate": 1.0570438453144043e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 8342093.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.60001220703126, |
|
"epoch": 0.8075, |
|
"grad_norm": 0.09776954891266137, |
|
"kl": 1.1421875, |
|
"learning_rate": 1.0310805975219255e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 8368479.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.85715026855468, |
|
"epoch": 0.81, |
|
"grad_norm": 0.05012984603712916, |
|
"kl": 0.96015625, |
|
"learning_rate": 1.0054034814246093e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 8394862.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.7714385986328, |
|
"epoch": 0.8125, |
|
"grad_norm": 0.2165819917911517, |
|
"kl": 1.35078125, |
|
"learning_rate": 9.800143481652979e-08, |
|
"loss": 0.0037, |
|
"num_tokens": 8421276.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.08572998046876, |
|
"epoch": 0.815, |
|
"grad_norm": 0.20309822089295643, |
|
"kl": 1.27265625, |
|
"learning_rate": 9.549150281252632e-08, |
|
"loss": 0.0037, |
|
"num_tokens": 8447767.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.6571533203125, |
|
"epoch": 0.8175, |
|
"grad_norm": 0.15578036977279336, |
|
"kl": 1.1515625, |
|
"learning_rate": 9.30107330792243e-08, |
|
"loss": 0.0035, |
|
"num_tokens": 8474150.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.4857208251953, |
|
"epoch": 0.82, |
|
"grad_norm": 0.043416576178703016, |
|
"kl": 0.9265625, |
|
"learning_rate": 9.055930446299914e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 8500171.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.57143859863282, |
|
"epoch": 0.8225, |
|
"grad_norm": 0.05061080629519643, |
|
"kl": 1.034375, |
|
"learning_rate": 8.813739369493395e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8526178.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.28572692871094, |
|
"epoch": 0.825, |
|
"grad_norm": 0.04111158267935015, |
|
"kl": 0.94296875, |
|
"learning_rate": 8.574517537807896e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 8552519.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.00000915527343, |
|
"epoch": 0.8275, |
|
"grad_norm": 0.3393008711951354, |
|
"kl": 1.38515625, |
|
"learning_rate": 8.338282197486362e-08, |
|
"loss": 0.0038, |
|
"num_tokens": 8578538.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.71430053710938, |
|
"epoch": 0.83, |
|
"grad_norm": 5.920098814946804, |
|
"kl": 1.471875, |
|
"learning_rate": 8.105050379466332e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8604935.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.34286499023438, |
|
"epoch": 0.8325, |
|
"grad_norm": 2.5733896209901923, |
|
"kl": 1.1671875, |
|
"learning_rate": 7.87483889815207e-08, |
|
"loss": 0.0031, |
|
"num_tokens": 8631504.0, |
|
"reward": 2.991787624359131, |
|
"reward_std": 0.02172858864068985, |
|
"rewards/classifier_reward": 0.9917873620986939, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.14287109375, |
|
"epoch": 0.835, |
|
"grad_norm": 2.715010668750814, |
|
"kl": 0.91875, |
|
"learning_rate": 7.64766435020246e-08, |
|
"loss": 0.0028, |
|
"num_tokens": 8657724.0, |
|
"reward": 2.989917850494385, |
|
"reward_std": 0.026675373315811157, |
|
"rewards/classifier_reward": 0.9899176597595215, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.05715637207032, |
|
"epoch": 0.8375, |
|
"grad_norm": 2.9706039903092303, |
|
"kl": 0.96171875, |
|
"learning_rate": 7.423543113334435e-08, |
|
"loss": 0.0029, |
|
"num_tokens": 8684291.0, |
|
"reward": 2.998081636428833, |
|
"reward_std": 0.005076154321432114, |
|
"rewards/classifier_reward": 0.9980813980102539, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.02857971191406, |
|
"epoch": 0.84, |
|
"grad_norm": 3.791827437250973, |
|
"kl": 0.978125, |
|
"learning_rate": 7.202491345142286e-08, |
|
"loss": 0.0029, |
|
"num_tokens": 8710997.0, |
|
"reward": 2.6675583362579345, |
|
"reward_std": 0.1484653353691101, |
|
"rewards/classifier_reward": 0.8389866888523102, |
|
"rewards/length_reward": 0.8285714298486709, |
|
"rewards/slop_reward": 1.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.48572692871093, |
|
"epoch": 0.8425, |
|
"grad_norm": 0.26642623759823414, |
|
"kl": 1.29609375, |
|
"learning_rate": 6.984524981932755e-08, |
|
"loss": 0.0037, |
|
"num_tokens": 8736817.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.2571533203125, |
|
"epoch": 0.845, |
|
"grad_norm": 0.05826256029366563, |
|
"kl": 0.9796875, |
|
"learning_rate": 6.769659737576227e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8763338.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.00001525878906, |
|
"epoch": 0.8475, |
|
"grad_norm": 0.0820068561247779, |
|
"kl": 0.91484375, |
|
"learning_rate": 6.557911102373809e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 8789832.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.57143859863282, |
|
"epoch": 0.85, |
|
"grad_norm": 0.145851308869347, |
|
"kl": 1.04375, |
|
"learning_rate": 6.349294341940592e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8816056.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.6571533203125, |
|
"epoch": 0.8525, |
|
"grad_norm": 4.893160591979278, |
|
"kl": 1.1859375, |
|
"learning_rate": 6.143824496105121e-08, |
|
"loss": 0.0031, |
|
"num_tokens": 8842714.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.80001220703124, |
|
"epoch": 0.855, |
|
"grad_norm": 0.06508578960217941, |
|
"kl": 1.04375, |
|
"learning_rate": 5.941516377825101e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8869237.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.80000915527344, |
|
"epoch": 0.8575, |
|
"grad_norm": 4.184670329516298, |
|
"kl": 0.9828125, |
|
"learning_rate": 5.7423845721195184e-08, |
|
"loss": 0.0024, |
|
"num_tokens": 8895106.0, |
|
"reward": 2.950000190734863, |
|
"reward_std": 0.13228756189346313, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.22858276367188, |
|
"epoch": 0.86, |
|
"grad_norm": 0.04621548010483809, |
|
"kl": 0.96796875, |
|
"learning_rate": 5.546443435017145e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 8921496.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.71429443359375, |
|
"epoch": 0.8625, |
|
"grad_norm": 3.0589783495407725, |
|
"kl": 0.93515625, |
|
"learning_rate": 5.353707092521581e-08, |
|
"loss": 0.0028, |
|
"num_tokens": 8947905.0, |
|
"reward": 2.994158411026001, |
|
"reward_std": 0.01545594185590744, |
|
"rewards/classifier_reward": 0.9941581964492798, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.94286499023437, |
|
"epoch": 0.865, |
|
"grad_norm": 5.554678417760662, |
|
"kl": 3.01484375, |
|
"learning_rate": 5.16418943959282e-08, |
|
"loss": 0.0054, |
|
"num_tokens": 8974370.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.7714416503906, |
|
"epoch": 0.8675, |
|
"grad_norm": 0.0645376640043755, |
|
"kl": 0.915625, |
|
"learning_rate": 4.9779041391455775e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9000853.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.8571563720703, |
|
"epoch": 0.87, |
|
"grad_norm": 0.1578429426456495, |
|
"kl": 1.0453125, |
|
"learning_rate": 4.794864621064265e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9027326.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.51429748535156, |
|
"epoch": 0.8725, |
|
"grad_norm": 0.09635841303768418, |
|
"kl": 1.05234375, |
|
"learning_rate": 4.615084081234799e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9053438.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.80000915527344, |
|
"epoch": 0.875, |
|
"grad_norm": 9.025341176830713, |
|
"kl": 0.93828125, |
|
"learning_rate": 4.4385754805932095e-08, |
|
"loss": 0.0024, |
|
"num_tokens": 9079245.0, |
|
"reward": 2.9694100856781005, |
|
"reward_std": 0.0809337928891182, |
|
"rewards/classifier_reward": 0.9908384680747986, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.28572692871094, |
|
"epoch": 0.8775, |
|
"grad_norm": 3.7703349335107053, |
|
"kl": 1.09296875, |
|
"learning_rate": 4.2653515441913646e-08, |
|
"loss": 0.003, |
|
"num_tokens": 9105890.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 229.60000915527343, |
|
"epoch": 0.88, |
|
"grad_norm": 4.44121260340152, |
|
"kl": 0.9359375, |
|
"learning_rate": 4.095424760279453e-08, |
|
"loss": 0.0024, |
|
"num_tokens": 9131259.0, |
|
"reward": 2.8642487049102785, |
|
"reward_std": 0.13740314245224, |
|
"rewards/classifier_reward": 0.8642484605312347, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.171435546875, |
|
"epoch": 0.8825, |
|
"grad_norm": 4.315109786774216, |
|
"kl": 1.1671875, |
|
"learning_rate": 3.928807379405763e-08, |
|
"loss": 0.0026, |
|
"num_tokens": 9157515.0, |
|
"reward": 2.804906415939331, |
|
"reward_std": 0.016342369094491004, |
|
"rewards/classifier_reward": 0.8049062207341194, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.2571563720703, |
|
"epoch": 0.885, |
|
"grad_norm": 2.5953487008425773, |
|
"kl": 0.98125, |
|
"learning_rate": 3.7655114135334284e-08, |
|
"loss": 0.0029, |
|
"num_tokens": 9183753.0, |
|
"reward": 2.9993186473846434, |
|
"reward_std": 0.0018032947555184364, |
|
"rewards/classifier_reward": 0.9993184208869934, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.91429748535157, |
|
"epoch": 0.8875, |
|
"grad_norm": 3.061039239899815, |
|
"kl": 0.87109375, |
|
"learning_rate": 3.6055486351745324e-08, |
|
"loss": 0.0028, |
|
"num_tokens": 9210003.0, |
|
"reward": 2.9983937740325928, |
|
"reward_std": 0.004250280186533928, |
|
"rewards/classifier_reward": 0.9983935475349426, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.65715637207032, |
|
"epoch": 0.89, |
|
"grad_norm": 0.07280216811639449, |
|
"kl": 1.0234375, |
|
"learning_rate": 3.448930576541309e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9236471.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.571435546875, |
|
"epoch": 0.8925, |
|
"grad_norm": 0.04031184293990925, |
|
"kl": 0.85625, |
|
"learning_rate": 3.295668528714801e-08, |
|
"loss": 0.0032, |
|
"num_tokens": 9262896.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.0571533203125, |
|
"epoch": 0.895, |
|
"grad_norm": 0.048980473886171605, |
|
"kl": 0.90390625, |
|
"learning_rate": 3.145773540830815e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9289145.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.48572692871093, |
|
"epoch": 0.8975, |
|
"grad_norm": 0.06968257994497579, |
|
"kl": 0.9921875, |
|
"learning_rate": 2.9992564192834246e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9315727.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.34286804199218, |
|
"epoch": 0.9, |
|
"grad_norm": 0.05048579158042856, |
|
"kl": 0.965625, |
|
"learning_rate": 2.8561277269457895e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9342091.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.94287109375, |
|
"epoch": 0.9025, |
|
"grad_norm": 0.07338683112545501, |
|
"kl": 0.9578125, |
|
"learning_rate": 2.7163977824087692e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9368444.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.48572387695313, |
|
"epoch": 0.905, |
|
"grad_norm": 2.0759195339474985, |
|
"kl": 2.634375, |
|
"learning_rate": 2.5800766592369073e-08, |
|
"loss": 0.005, |
|
"num_tokens": 9394956.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.1428649902344, |
|
"epoch": 0.9075, |
|
"grad_norm": 0.071979853158223, |
|
"kl": 0.9921875, |
|
"learning_rate": 2.4471741852423233e-08, |
|
"loss": 0.0034, |
|
"num_tokens": 9421111.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.60001525878906, |
|
"epoch": 0.91, |
|
"grad_norm": 4.112340756036376, |
|
"kl": 1.06640625, |
|
"learning_rate": 2.3176999417760633e-08, |
|
"loss": 0.003, |
|
"num_tokens": 9447147.0, |
|
"reward": 2.997753620147705, |
|
"reward_std": 0.005943871289491654, |
|
"rewards/classifier_reward": 0.9977534294128418, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.65715637207032, |
|
"epoch": 0.9125, |
|
"grad_norm": 0.1514075169088936, |
|
"kl": 1.1046875, |
|
"learning_rate": 2.1916632630374577e-08, |
|
"loss": 0.0035, |
|
"num_tokens": 9473085.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.74286804199218, |
|
"epoch": 0.915, |
|
"grad_norm": 0.18737733575482768, |
|
"kl": 1.07578125, |
|
"learning_rate": 2.0690732354011088e-08, |
|
"loss": 0.0035, |
|
"num_tokens": 9499431.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.6571533203125, |
|
"epoch": 0.9175, |
|
"grad_norm": 0.04304702113082608, |
|
"kl": 0.92421875, |
|
"learning_rate": 1.9499386967619104e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9525433.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.6571502685547, |
|
"epoch": 0.92, |
|
"grad_norm": 2.545151028403328, |
|
"kl": 0.9734375, |
|
"learning_rate": 1.8342682358978068e-08, |
|
"loss": 0.0029, |
|
"num_tokens": 9551354.0, |
|
"reward": 2.9982373237609865, |
|
"reward_std": 0.004664153978228569, |
|
"rewards/classifier_reward": 0.9982371211051941, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.11429443359376, |
|
"epoch": 0.9225, |
|
"grad_norm": 0.04345300022978855, |
|
"kl": 0.9125, |
|
"learning_rate": 1.7220701918506662e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9577818.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.74287109375, |
|
"epoch": 0.925, |
|
"grad_norm": 3.5704547668159674, |
|
"kl": 1.284375, |
|
"learning_rate": 1.6133526533250563e-08, |
|
"loss": 0.0032, |
|
"num_tokens": 9604325.0, |
|
"reward": 2.9714287757873534, |
|
"reward_std": 0.07559289336204529, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 0.9714285731315613, |
|
"rewards/slop_reward": 1.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.11429748535156, |
|
"epoch": 0.9275, |
|
"grad_norm": 0.041173535714356384, |
|
"kl": 0.88828125, |
|
"learning_rate": 1.5081234581051482e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9630168.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.8571533203125, |
|
"epoch": 0.93, |
|
"grad_norm": 0.08243253183651265, |
|
"kl": 0.934375, |
|
"learning_rate": 1.4063901924895982e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9656156.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.1428649902344, |
|
"epoch": 0.9325, |
|
"grad_norm": 0.05082406017565479, |
|
"kl": 0.94921875, |
|
"learning_rate": 1.3081601907447004e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9682368.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.80001220703124, |
|
"epoch": 0.935, |
|
"grad_norm": 2.8407093679191733, |
|
"kl": 1.04453125, |
|
"learning_rate": 1.2134405345755772e-08, |
|
"loss": 0.003, |
|
"num_tokens": 9708489.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.08572692871093, |
|
"epoch": 0.9375, |
|
"grad_norm": 0.14511834175077776, |
|
"kl": 1.07421875, |
|
"learning_rate": 1.1222380526156927e-08, |
|
"loss": 0.0035, |
|
"num_tokens": 9734849.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.48572692871093, |
|
"epoch": 0.94, |
|
"grad_norm": 0.042968063146662946, |
|
"kl": 0.91875, |
|
"learning_rate": 1.034559319934497e-08, |
|
"loss": 0.0033, |
|
"num_tokens": 9761221.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.17144165039062, |
|
"epoch": 0.9425, |
|
"grad_norm": 3.168647847961566, |
|
"kl": 1.0890625, |
|
"learning_rate": 9.504106575634663e-09, |
|
"loss": 0.003, |
|
"num_tokens": 9787372.0, |
|
"reward": 2.965010404586792, |
|
"reward_std": 0.06012881994247436, |
|
"rewards/classifier_reward": 0.9864387512207031, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.91429748535157, |
|
"epoch": 0.945, |
|
"grad_norm": 0.04526250685823323, |
|
"kl": 0.99375, |
|
"learning_rate": 8.697981320403336e-09, |
|
"loss": 0.0034, |
|
"num_tokens": 9813619.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 246.20001220703125, |
|
"epoch": 0.9475, |
|
"grad_norm": 4.273461002843142, |
|
"kl": 0.9484375, |
|
"learning_rate": 7.927275549718226e-09, |
|
"loss": 0.0019, |
|
"num_tokens": 9839751.0, |
|
"reward": 2.9877804279327393, |
|
"reward_std": 0.03233038559556008, |
|
"rewards/classifier_reward": 0.9877802491188049, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.8571533203125, |
|
"epoch": 0.95, |
|
"grad_norm": 0.20002216578085852, |
|
"kl": 1.165625, |
|
"learning_rate": 7.1920448261457715e-09, |
|
"loss": 0.0035, |
|
"num_tokens": 9866244.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.60000915527343, |
|
"epoch": 0.9525, |
|
"grad_norm": 0.0834845425154776, |
|
"kl": 1.04375, |
|
"learning_rate": 6.492342154746588e-09, |
|
"loss": 0.0034, |
|
"num_tokens": 9892505.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.31429748535157, |
|
"epoch": 0.955, |
|
"grad_norm": 2.52753839477545, |
|
"kl": 1.11171875, |
|
"learning_rate": 5.828217979253869e-09, |
|
"loss": 0.003, |
|
"num_tokens": 9918307.0, |
|
"reward": 2.9943798542022706, |
|
"reward_std": 0.014869998395442962, |
|
"rewards/classifier_reward": 0.9943796753883362, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.571435546875, |
|
"epoch": 0.9575, |
|
"grad_norm": 3.128240693093393, |
|
"kl": 0.98359375, |
|
"learning_rate": 5.1997201784368395e-09, |
|
"loss": 0.0029, |
|
"num_tokens": 9944374.0, |
|
"reward": 2.995487594604492, |
|
"reward_std": 0.011939284205436707, |
|
"rewards/classifier_reward": 0.995487380027771, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.9428680419922, |
|
"epoch": 0.96, |
|
"grad_norm": 3.5538024950839295, |
|
"kl": 1.0, |
|
"learning_rate": 4.606894062648969e-09, |
|
"loss": 0.0024, |
|
"num_tokens": 9970727.0, |
|
"reward": 2.977288818359375, |
|
"reward_std": 0.06008877456188202, |
|
"rewards/classifier_reward": 0.9987171411514282, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.6571502685547, |
|
"epoch": 0.9625, |
|
"grad_norm": 0.1991534915494115, |
|
"kl": 1.184375, |
|
"learning_rate": 4.049782370561583e-09, |
|
"loss": 0.0036, |
|
"num_tokens": 9997070.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.34286804199218, |
|
"epoch": 0.965, |
|
"grad_norm": 0.05753950305238813, |
|
"kl": 0.99375, |
|
"learning_rate": 3.5284252660823244e-09, |
|
"loss": 0.0034, |
|
"num_tokens": 10023227.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.6571533203125, |
|
"epoch": 0.9675, |
|
"grad_norm": 0.09059748000607612, |
|
"kl": 1.003125, |
|
"learning_rate": 3.0428603354600844e-09, |
|
"loss": 0.0034, |
|
"num_tokens": 10049465.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 254.54286499023436, |
|
"epoch": 0.97, |
|
"grad_norm": 0.12475756025548783, |
|
"kl": 1.07890625, |
|
"learning_rate": 2.5931225845748917e-09, |
|
"loss": 0.0035, |
|
"num_tokens": 10076294.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.6571533203125, |
|
"epoch": 0.9725, |
|
"grad_norm": 0.4783333308376495, |
|
"kl": 1.3875, |
|
"learning_rate": 2.1792444364144847e-09, |
|
"loss": 0.0038, |
|
"num_tokens": 10102419.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.34286804199218, |
|
"epoch": 0.975, |
|
"grad_norm": 0.08716408529439204, |
|
"kl": 0.93125, |
|
"learning_rate": 1.8012557287367391e-09, |
|
"loss": 0.0033, |
|
"num_tokens": 10128786.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 241.00000915527343, |
|
"epoch": 0.9775, |
|
"grad_norm": 3.1645439252715826, |
|
"kl": 0.9140625, |
|
"learning_rate": 1.4591837119186102e-09, |
|
"loss": 0.0028, |
|
"num_tokens": 10155141.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.28572692871094, |
|
"epoch": 0.98, |
|
"grad_norm": 0.41518239333595675, |
|
"kl": 1.1234375, |
|
"learning_rate": 1.1530530469914256e-09, |
|
"loss": 0.0035, |
|
"num_tokens": 10181268.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.7714385986328, |
|
"epoch": 0.9825, |
|
"grad_norm": 0.040861855208901524, |
|
"kl": 0.878125, |
|
"learning_rate": 8.828858038632536e-10, |
|
"loss": 0.0033, |
|
"num_tokens": 10207349.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 249.9714385986328, |
|
"epoch": 0.985, |
|
"grad_norm": 0.12170276230850544, |
|
"kl": 1.1421875, |
|
"learning_rate": 6.48701459727563e-10, |
|
"loss": 0.0035, |
|
"num_tokens": 10233544.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.00001220703126, |
|
"epoch": 0.9875, |
|
"grad_norm": 0.04242721259201106, |
|
"kl": 0.93828125, |
|
"learning_rate": 4.5051689765929213e-10, |
|
"loss": 0.0033, |
|
"num_tokens": 10259969.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 248.68572387695312, |
|
"epoch": 0.99, |
|
"grad_norm": 2.430119389458536, |
|
"kl": 0.96875, |
|
"learning_rate": 2.883464053973772e-10, |
|
"loss": 0.0029, |
|
"num_tokens": 10286476.0, |
|
"reward": 2.9996359825134276, |
|
"reward_std": 0.0009637950919568538, |
|
"rewards/classifier_reward": 0.9996357202529907, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.34286499023438, |
|
"epoch": 0.9925, |
|
"grad_norm": 2.4946704769049948, |
|
"kl": 1.090625, |
|
"learning_rate": 1.6220167431502118e-10, |
|
"loss": 0.003, |
|
"num_tokens": 10312797.0, |
|
"reward": 2.998571014404297, |
|
"reward_std": 0.0037812769412994387, |
|
"rewards/classifier_reward": 0.9985708117485046, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.60001220703126, |
|
"epoch": 0.995, |
|
"grad_norm": 0.057929252615434516, |
|
"kl": 0.93359375, |
|
"learning_rate": 7.209179857675663e-11, |
|
"loss": 0.0033, |
|
"num_tokens": 10338970.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.6285827636719, |
|
"epoch": 0.9975, |
|
"grad_norm": 0.06529828900649211, |
|
"kl": 1.0125, |
|
"learning_rate": 1.8023274482636965e-11, |
|
"loss": 0.0034, |
|
"num_tokens": 10365242.0, |
|
"reward": 3.000000238418579, |
|
"reward_std": 0.0, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 1.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.8800018310547, |
|
"epoch": 1.0, |
|
"grad_norm": 2.484762868251253, |
|
"kl": 0.875, |
|
"learning_rate": 0.0, |
|
"loss": 0.0028, |
|
"num_tokens": 10384026.0, |
|
"reward": 2.978571653366089, |
|
"reward_std": 0.056694668531417844, |
|
"rewards/classifier_reward": 1.0, |
|
"rewards/length_reward": 1.0, |
|
"rewards/slop_reward": 0.9785714268684387, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|