Lansechen's picture
Model save
1571795 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9925373134328357,
"eval_steps": 100,
"global_step": 266,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 593.2823944091797,
"epoch": 0.007462686567164179,
"grad_norm": 0.5037462115287781,
"learning_rate": 3.7037037037037036e-08,
"loss": 0.2424,
"num_tokens": 667405.0,
"reward": 0.18871622439473867,
"reward_std": 0.5178131051361561,
"rewards/accuracy_reward": 0.13169642724096775,
"rewards/cosine_scaled_reward": 0.00010013708379119635,
"rewards/format_reward": 0.05691964435391128,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 629.1518173217773,
"epoch": 0.014925373134328358,
"grad_norm": 0.7031348943710327,
"learning_rate": 7.407407407407407e-08,
"loss": 0.2401,
"num_tokens": 1365053.0,
"reward": 0.20235019456595182,
"reward_std": 0.5161089487373829,
"rewards/accuracy_reward": 0.13616071455180645,
"rewards/cosine_scaled_reward": -0.006355166085995734,
"rewards/format_reward": 0.0725446434225887,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 579.0893096923828,
"epoch": 0.022388059701492536,
"grad_norm": 0.6601409316062927,
"learning_rate": 1.111111111111111e-07,
"loss": 0.2374,
"num_tokens": 2014861.0,
"reward": 0.2147554385010153,
"reward_std": 0.5122785679996014,
"rewards/accuracy_reward": 0.14174107275903225,
"rewards/cosine_scaled_reward": 0.008282212191261351,
"rewards/format_reward": 0.06473214365541935,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 565.4654350280762,
"epoch": 0.029850746268656716,
"grad_norm": 0.4544272720813751,
"learning_rate": 1.4814814814814815e-07,
"loss": 0.2732,
"num_tokens": 2647950.0,
"reward": 0.2221650118008256,
"reward_std": 0.5312090590596199,
"rewards/accuracy_reward": 0.14062500093132257,
"rewards/cosine_scaled_reward": 0.01792393707728479,
"rewards/format_reward": 0.06361607159487903,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 537.1294937133789,
"epoch": 0.03731343283582089,
"grad_norm": 0.4951060116291046,
"learning_rate": 1.8518518518518516e-07,
"loss": 0.2245,
"num_tokens": 3266122.0,
"reward": 0.2845571478828788,
"reward_std": 0.5756800286471844,
"rewards/accuracy_reward": 0.16183035727590322,
"rewards/cosine_scaled_reward": 0.05464643065351993,
"rewards/format_reward": 0.06808035844005644,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 595.4140930175781,
"epoch": 0.04477611940298507,
"grad_norm": 0.43293312191963196,
"learning_rate": 2.222222222222222e-07,
"loss": 0.2237,
"num_tokens": 3928245.0,
"reward": 0.22057450748980045,
"reward_std": 0.5795701257884502,
"rewards/accuracy_reward": 0.1395089291036129,
"rewards/cosine_scaled_reward": 0.011869142268551514,
"rewards/format_reward": 0.06919642933644354,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 557.8248062133789,
"epoch": 0.05223880597014925,
"grad_norm": 0.650478720664978,
"learning_rate": 2.5925925925925923e-07,
"loss": 0.2528,
"num_tokens": 4550280.0,
"reward": 0.23954601865261793,
"reward_std": 0.5432833544909954,
"rewards/accuracy_reward": 0.1372767877765,
"rewards/cosine_scaled_reward": 0.01856386021245271,
"rewards/format_reward": 0.08370535750873387,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 560.6506958007812,
"epoch": 0.05970149253731343,
"grad_norm": 0.46624094247817993,
"learning_rate": 2.962962962962963e-07,
"loss": 0.2368,
"num_tokens": 5196015.0,
"reward": 0.23683909513056278,
"reward_std": 0.5061681233346462,
"rewards/accuracy_reward": 0.15178571362048388,
"rewards/cosine_scaled_reward": 0.024785520159639418,
"rewards/format_reward": 0.06026785750873387,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 571.0123062133789,
"epoch": 0.06716417910447761,
"grad_norm": 0.533889889717102,
"learning_rate": 3.333333333333333e-07,
"loss": 0.2392,
"num_tokens": 5835498.0,
"reward": 0.23420938570052385,
"reward_std": 0.5464016310870647,
"rewards/accuracy_reward": 0.13839285681024194,
"rewards/cosine_scaled_reward": 0.023271879297681153,
"rewards/format_reward": 0.07254464412108064,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 573.4230194091797,
"epoch": 0.07462686567164178,
"grad_norm": 8.411685943603516,
"learning_rate": 3.703703703703703e-07,
"loss": 0.2067,
"num_tokens": 6479685.0,
"reward": 0.28206104040145874,
"reward_std": 0.5646266750991344,
"rewards/accuracy_reward": 0.16406249906867743,
"rewards/cosine_scaled_reward": 0.03429317264817655,
"rewards/format_reward": 0.08370535681024194,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 545.127254486084,
"epoch": 0.08208955223880597,
"grad_norm": 0.9542278051376343,
"learning_rate": 4.0740740740740737e-07,
"loss": 0.1322,
"num_tokens": 7103751.0,
"reward": 0.31405315548181534,
"reward_std": 0.6051793843507767,
"rewards/accuracy_reward": 0.1629464291036129,
"rewards/cosine_scaled_reward": 0.04619600536534563,
"rewards/format_reward": 0.10491071362048388,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 613.8381958007812,
"epoch": 0.08955223880597014,
"grad_norm": 0.5434120297431946,
"learning_rate": 4.444444444444444e-07,
"loss": 0.2048,
"num_tokens": 7791062.0,
"reward": 0.26040036062477157,
"reward_std": 0.5332776308059692,
"rewards/accuracy_reward": 0.14174107008147985,
"rewards/cosine_scaled_reward": 0.015980710508301854,
"rewards/format_reward": 0.10267857136204839,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 515.9442176818848,
"epoch": 0.09701492537313433,
"grad_norm": 0.5895915031433105,
"learning_rate": 4.814814814814814e-07,
"loss": 0.1288,
"num_tokens": 8388340.0,
"reward": 0.40466225892305374,
"reward_std": 0.6555211395025253,
"rewards/accuracy_reward": 0.1919642873108387,
"rewards/cosine_scaled_reward": 0.07988545499392785,
"rewards/format_reward": 0.1328124995343387,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 584.5747985839844,
"epoch": 0.1044776119402985,
"grad_norm": 1.4778566360473633,
"learning_rate": 5.185185185185185e-07,
"loss": 0.1507,
"num_tokens": 9054239.0,
"reward": 0.2996965404599905,
"reward_std": 0.5809138379991055,
"rewards/accuracy_reward": 0.11941964272409678,
"rewards/cosine_scaled_reward": -0.00722311669960618,
"rewards/format_reward": 0.1875,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 620.6741371154785,
"epoch": 0.11194029850746269,
"grad_norm": 2.612856149673462,
"learning_rate": 5.555555555555555e-07,
"loss": 0.2128,
"num_tokens": 9741539.0,
"reward": 0.34804879780858755,
"reward_std": 0.6096060052514076,
"rewards/accuracy_reward": 0.14062499906867743,
"rewards/cosine_scaled_reward": 0.010995208285748959,
"rewards/format_reward": 0.19642856996506453,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 603.7500228881836,
"epoch": 0.11940298507462686,
"grad_norm": 1.191655158996582,
"learning_rate": 5.925925925925926e-07,
"loss": 0.1818,
"num_tokens": 10413043.0,
"reward": 0.3713220842182636,
"reward_std": 0.639706090092659,
"rewards/accuracy_reward": 0.13950893003493547,
"rewards/cosine_scaled_reward": 0.011947068211156875,
"rewards/format_reward": 0.2198660708963871,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 544.5558242797852,
"epoch": 0.12686567164179105,
"grad_norm": 0.6938550472259521,
"learning_rate": 6.296296296296296e-07,
"loss": 0.085,
"num_tokens": 11037421.0,
"reward": 0.5226609222590923,
"reward_std": 0.7315020114183426,
"rewards/accuracy_reward": 0.17410713899880648,
"rewards/cosine_scaled_reward": 0.05502696509938687,
"rewards/format_reward": 0.2935267835855484,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 571.3337364196777,
"epoch": 0.13432835820895522,
"grad_norm": 1.5410027503967285,
"learning_rate": 6.666666666666666e-07,
"loss": 0.1118,
"num_tokens": 11676216.0,
"reward": 0.591816034168005,
"reward_std": 0.7450486496090889,
"rewards/accuracy_reward": 0.17633928451687098,
"rewards/cosine_scaled_reward": 0.05163742566946894,
"rewards/format_reward": 0.3638392873108387,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 549.0669898986816,
"epoch": 0.1417910447761194,
"grad_norm": 0.5230954885482788,
"learning_rate": 7.037037037037037e-07,
"loss": 0.0718,
"num_tokens": 12316516.0,
"reward": 0.7351889088749886,
"reward_std": 0.7607561945915222,
"rewards/accuracy_reward": 0.1893028812482953,
"rewards/cosine_scaled_reward": 0.08228707825765014,
"rewards/format_reward": 0.4654017835855484,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 553.9040451049805,
"epoch": 0.14925373134328357,
"grad_norm": 1.0374072790145874,
"learning_rate": 7.407407407407406e-07,
"loss": 0.0991,
"num_tokens": 12945302.0,
"reward": 0.6941376700997353,
"reward_std": 0.730622187256813,
"rewards/accuracy_reward": 0.13281249720603228,
"rewards/cosine_scaled_reward": 0.027843003364978358,
"rewards/format_reward": 0.5334821417927742,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 513.1774749755859,
"epoch": 0.15671641791044777,
"grad_norm": 0.46707218885421753,
"learning_rate": 7.777777777777778e-07,
"loss": 0.0643,
"num_tokens": 13534109.0,
"reward": 0.8666251823306084,
"reward_std": 0.7560148313641548,
"rewards/accuracy_reward": 0.1741071422584355,
"rewards/cosine_scaled_reward": 0.07198226451873779,
"rewards/format_reward": 0.6205357164144516,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 577.2232398986816,
"epoch": 0.16417910447761194,
"grad_norm": 0.7016672492027283,
"learning_rate": 8.148148148148147e-07,
"loss": 0.1127,
"num_tokens": 14194037.0,
"reward": 0.8435313403606415,
"reward_std": 0.7020122557878494,
"rewards/accuracy_reward": 0.14174107182770967,
"rewards/cosine_scaled_reward": 0.02098666892379697,
"rewards/format_reward": 0.6808035746216774,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 502.9531440734863,
"epoch": 0.17164179104477612,
"grad_norm": 0.35482147336006165,
"learning_rate": 8.518518518518518e-07,
"loss": 0.1307,
"num_tokens": 14768411.0,
"reward": 1.1029141992330551,
"reward_std": 0.7098172605037689,
"rewards/accuracy_reward": 0.22544642724096775,
"rewards/cosine_scaled_reward": 0.11742306314408779,
"rewards/format_reward": 0.7600446492433548,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 512.8035926818848,
"epoch": 0.1791044776119403,
"grad_norm": 0.32759323716163635,
"learning_rate": 8.888888888888888e-07,
"loss": 0.0876,
"num_tokens": 15351427.0,
"reward": 1.149243749678135,
"reward_std": 0.7510530278086662,
"rewards/accuracy_reward": 0.21205356903374195,
"rewards/cosine_scaled_reward": 0.1124133332632482,
"rewards/format_reward": 0.8247767984867096,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 535.4419860839844,
"epoch": 0.1865671641791045,
"grad_norm": 0.3772229850292206,
"learning_rate": 9.259259259259259e-07,
"loss": 0.1594,
"num_tokens": 15974879.0,
"reward": 1.1618424132466316,
"reward_std": 0.6563375778496265,
"rewards/accuracy_reward": 0.20535714086145163,
"rewards/cosine_scaled_reward": 0.09934233513195068,
"rewards/format_reward": 0.8571428582072258,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 557.8861923217773,
"epoch": 0.19402985074626866,
"grad_norm": 0.291864275932312,
"learning_rate": 9.629629629629628e-07,
"loss": 0.0932,
"num_tokens": 16604401.0,
"reward": 1.2150916159152985,
"reward_std": 0.7205987647175789,
"rewards/accuracy_reward": 0.22656249813735485,
"rewards/cosine_scaled_reward": 0.12022545811487362,
"rewards/format_reward": 0.8683035597205162,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 512.9129600524902,
"epoch": 0.20149253731343283,
"grad_norm": 0.31357285380363464,
"learning_rate": 1e-06,
"loss": 0.1004,
"num_tokens": 17195475.0,
"reward": 1.3348890244960785,
"reward_std": 0.6589159071445465,
"rewards/accuracy_reward": 0.26897321455180645,
"rewards/cosine_scaled_reward": 0.1685942793264985,
"rewards/format_reward": 0.8973214253783226,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 532.2611846923828,
"epoch": 0.208955223880597,
"grad_norm": 0.3403704762458801,
"learning_rate": 9.999575185316993e-07,
"loss": 0.1619,
"num_tokens": 17811437.0,
"reward": 1.2805243134498596,
"reward_std": 0.6447809338569641,
"rewards/accuracy_reward": 0.24441963993012905,
"rewards/cosine_scaled_reward": 0.12873857002705336,
"rewards/format_reward": 0.9073660746216774,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 506.21988677978516,
"epoch": 0.21641791044776118,
"grad_norm": 0.35326462984085083,
"learning_rate": 9.99830081345498e-07,
"loss": 0.1134,
"num_tokens": 18408722.0,
"reward": 1.336455225944519,
"reward_std": 0.6456731334328651,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.15565160103142262,
"rewards/format_reward": 0.9308035746216774,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 503.8482475280762,
"epoch": 0.22388059701492538,
"grad_norm": 0.26988422870635986,
"learning_rate": 9.996177100962712e-07,
"loss": 0.0995,
"num_tokens": 18986002.0,
"reward": 1.4584019258618355,
"reward_std": 0.6846916638314724,
"rewards/accuracy_reward": 0.315848215483129,
"rewards/cosine_scaled_reward": 0.21733042690902948,
"rewards/format_reward": 0.9252232164144516,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 471.99778747558594,
"epoch": 0.23134328358208955,
"grad_norm": 0.3263660669326782,
"learning_rate": 9.99320440871389e-07,
"loss": 0.1279,
"num_tokens": 19548200.0,
"reward": 1.4953693896532059,
"reward_std": 0.7131579741835594,
"rewards/accuracy_reward": 0.32142856903374195,
"rewards/cosine_scaled_reward": 0.2308604083955288,
"rewards/format_reward": 0.9430803582072258,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 494.79801177978516,
"epoch": 0.23880597014925373,
"grad_norm": 0.32770803570747375,
"learning_rate": 9.989383241845837e-07,
"loss": 0.0804,
"num_tokens": 20116083.0,
"reward": 1.599047303199768,
"reward_std": 0.754006952047348,
"rewards/accuracy_reward": 0.3761160746216774,
"rewards/cosine_scaled_reward": 0.28208293952047825,
"rewards/format_reward": 0.9408482015132904,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 531.2064971923828,
"epoch": 0.2462686567164179,
"grad_norm": 0.27149882912635803,
"learning_rate": 9.984714249673673e-07,
"loss": 0.1024,
"num_tokens": 20746676.0,
"reward": 1.6310840100049973,
"reward_std": 0.6816431954503059,
"rewards/accuracy_reward": 0.3861607164144516,
"rewards/cosine_scaled_reward": 0.27952144481241703,
"rewards/format_reward": 0.9654017835855484,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 514.7946662902832,
"epoch": 0.2537313432835821,
"grad_norm": 0.27265796065330505,
"learning_rate": 9.979198225579968e-07,
"loss": 0.1376,
"num_tokens": 21335188.0,
"reward": 1.7159467786550522,
"reward_std": 0.6568828374147415,
"rewards/accuracy_reward": 0.4274553544819355,
"rewards/cosine_scaled_reward": 0.3186252359300852,
"rewards/format_reward": 0.9698660746216774,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 546.0234527587891,
"epoch": 0.26119402985074625,
"grad_norm": 0.2513149380683899,
"learning_rate": 9.972836106879934e-07,
"loss": 0.1169,
"num_tokens": 21950753.0,
"reward": 1.686128944158554,
"reward_std": 0.6522306874394417,
"rewards/accuracy_reward": 0.4196428582072258,
"rewards/cosine_scaled_reward": 0.316709216684103,
"rewards/format_reward": 0.9497767835855484,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 481.2410888671875,
"epoch": 0.26865671641791045,
"grad_norm": 0.2620702385902405,
"learning_rate": 9.965628974662144e-07,
"loss": 0.1147,
"num_tokens": 22503649.0,
"reward": 1.9226552546024323,
"reward_std": 0.6497415080666542,
"rewards/accuracy_reward": 0.5256696417927742,
"rewards/cosine_scaled_reward": 0.4248872734606266,
"rewards/format_reward": 0.9720982164144516,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 523.6540451049805,
"epoch": 0.27611940298507465,
"grad_norm": 0.2901047468185425,
"learning_rate": 9.957578053604837e-07,
"loss": 0.155,
"num_tokens": 23097323.0,
"reward": 1.9160521030426025,
"reward_std": 0.5634343735873699,
"rewards/accuracy_reward": 0.5234375074505806,
"rewards/cosine_scaled_reward": 0.4238645453006029,
"rewards/format_reward": 0.96875,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 471.8593978881836,
"epoch": 0.2835820895522388,
"grad_norm": 0.2969004511833191,
"learning_rate": 9.948684711767799e-07,
"loss": 0.1299,
"num_tokens": 23653853.0,
"reward": 1.9379696995019913,
"reward_std": 0.4608934037387371,
"rewards/accuracy_reward": 0.5189732126891613,
"rewards/cosine_scaled_reward": 0.434621412307024,
"rewards/format_reward": 0.984375,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 532.287971496582,
"epoch": 0.291044776119403,
"grad_norm": 0.2593567669391632,
"learning_rate": 9.938950460359912e-07,
"loss": 0.1593,
"num_tokens": 24272495.0,
"reward": 1.7254538089036942,
"reward_std": 0.5551125332713127,
"rewards/accuracy_reward": 0.4196428656578064,
"rewards/cosine_scaled_reward": 0.32478404976427555,
"rewards/format_reward": 0.9810267761349678,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 484.3884086608887,
"epoch": 0.29850746268656714,
"grad_norm": 0.279813677072525,
"learning_rate": 9.928376953482342e-07,
"loss": 0.1591,
"num_tokens": 24837707.0,
"reward": 1.905771628022194,
"reward_std": 0.4304558988660574,
"rewards/accuracy_reward": 0.5066964253783226,
"rewards/cosine_scaled_reward": 0.42139650508761406,
"rewards/format_reward": 0.9776785671710968,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 467.0167579650879,
"epoch": 0.30597014925373134,
"grad_norm": 0.486409068107605,
"learning_rate": 9.916965987847484e-07,
"loss": 0.1263,
"num_tokens": 25387114.0,
"reward": 1.8283725529909134,
"reward_std": 0.5463476590812206,
"rewards/accuracy_reward": 0.4620535746216774,
"rewards/cosine_scaled_reward": 0.38417606614530087,
"rewards/format_reward": 0.9821428582072258,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 461.70984268188477,
"epoch": 0.31343283582089554,
"grad_norm": 0.30666935443878174,
"learning_rate": 9.904719502473632e-07,
"loss": 0.1408,
"num_tokens": 25937686.0,
"reward": 1.784839078783989,
"reward_std": 0.5817533135414124,
"rewards/accuracy_reward": 0.4464285708963871,
"rewards/cosine_scaled_reward": 0.3629639744758606,
"rewards/format_reward": 0.9754464328289032,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 387.7968940734863,
"epoch": 0.3208955223880597,
"grad_norm": 0.3015042841434479,
"learning_rate": 9.89163957835551e-07,
"loss": 0.1362,
"num_tokens": 26423936.0,
"reward": 1.9611081928014755,
"reward_std": 0.5745424814522266,
"rewards/accuracy_reward": 0.5156250037252903,
"rewards/cosine_scaled_reward": 0.45664383843541145,
"rewards/format_reward": 0.9888392761349678,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 348.5122871398926,
"epoch": 0.3283582089552239,
"grad_norm": 0.34659165143966675,
"learning_rate": 9.877728438110645e-07,
"loss": 0.1396,
"num_tokens": 26857179.0,
"reward": 1.9145096093416214,
"reward_std": 0.5022773817181587,
"rewards/accuracy_reward": 0.4955357201397419,
"rewards/cosine_scaled_reward": 0.44241129234433174,
"rewards/format_reward": 0.9765625,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 332.58372497558594,
"epoch": 0.3358208955223881,
"grad_norm": 0.43583589792251587,
"learning_rate": 9.862988445601687e-07,
"loss": 0.169,
"num_tokens": 27290358.0,
"reward": 1.7086158692836761,
"reward_std": 0.429446816444397,
"rewards/accuracy_reward": 0.3816964291036129,
"rewards/cosine_scaled_reward": 0.3414282575249672,
"rewards/format_reward": 0.9854910671710968,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 278.4453239440918,
"epoch": 0.34328358208955223,
"grad_norm": 0.46366527676582336,
"learning_rate": 9.847422105534737e-07,
"loss": 0.1147,
"num_tokens": 27683117.0,
"reward": 1.891074076294899,
"reward_std": 0.536506325006485,
"rewards/accuracy_reward": 0.4654017798602581,
"rewards/cosine_scaled_reward": 0.43236861005425453,
"rewards/format_reward": 0.9933035597205162,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 242.423002243042,
"epoch": 0.35074626865671643,
"grad_norm": 0.5321322679519653,
"learning_rate": 9.831032063033724e-07,
"loss": 0.113,
"num_tokens": 28037664.0,
"reward": 1.8316063284873962,
"reward_std": 0.5450388044118881,
"rewards/accuracy_reward": 0.4308035634458065,
"rewards/cosine_scaled_reward": 0.4063830114901066,
"rewards/format_reward": 0.9944196417927742,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 193.76898002624512,
"epoch": 0.3582089552238806,
"grad_norm": 0.5765193104743958,
"learning_rate": 9.813821103190931e-07,
"loss": 0.1659,
"num_tokens": 28342873.0,
"reward": 1.669696494936943,
"reward_std": 0.4199746139347553,
"rewards/accuracy_reward": 0.3482142835855484,
"rewards/cosine_scaled_reward": 0.3315267227590084,
"rewards/format_reward": 0.9899553507566452,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 158.8895149230957,
"epoch": 0.3656716417910448,
"grad_norm": 1.0433906316757202,
"learning_rate": 9.795792150593738e-07,
"loss": 0.135,
"num_tokens": 28625046.0,
"reward": 1.7185450494289398,
"reward_std": 0.4323030523955822,
"rewards/accuracy_reward": 0.3727678544819355,
"rewards/cosine_scaled_reward": 0.3614020850509405,
"rewards/format_reward": 0.9843749925494194,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 140.64509391784668,
"epoch": 0.373134328358209,
"grad_norm": 2.962268829345703,
"learning_rate": 9.776948268827657e-07,
"loss": 0.1502,
"num_tokens": 28884872.0,
"reward": 1.5966612845659256,
"reward_std": 0.47611169144511223,
"rewards/accuracy_reward": 0.3058035708963871,
"rewards/cosine_scaled_reward": 0.29755405336618423,
"rewards/format_reward": 0.9933035597205162,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 110.19531726837158,
"epoch": 0.3805970149253731,
"grad_norm": 1.2034224271774292,
"learning_rate": 9.757292659955754e-07,
"loss": 0.1468,
"num_tokens": 29105703.0,
"reward": 1.6452730596065521,
"reward_std": 0.4458727203309536,
"rewards/accuracy_reward": 0.32924107275903225,
"rewards/cosine_scaled_reward": 0.32384440395981073,
"rewards/format_reward": 0.9921874925494194,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 88.12277317047119,
"epoch": 0.3880597014925373,
"grad_norm": 1.8556318283081055,
"learning_rate": 9.736828663974526e-07,
"loss": 0.1886,
"num_tokens": 29322037.0,
"reward": 1.5704896599054337,
"reward_std": 0.4764312729239464,
"rewards/accuracy_reward": 0.29017857275903225,
"rewards/cosine_scaled_reward": 0.28700744174420834,
"rewards/format_reward": 0.9933035671710968,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 69.49553918838501,
"epoch": 0.39552238805970147,
"grad_norm": 1.8629873991012573,
"learning_rate": 9.715559758246361e-07,
"loss": 0.1698,
"num_tokens": 29519473.0,
"reward": 1.5159788131713867,
"reward_std": 0.4448518790304661,
"rewards/accuracy_reward": 0.261160715483129,
"rewards/cosine_scaled_reward": 0.2581662777811289,
"rewards/format_reward": 0.9966517761349678,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 53.76785898208618,
"epoch": 0.40298507462686567,
"grad_norm": 2.3949267864227295,
"learning_rate": 9.69348955690864e-07,
"loss": 0.1639,
"num_tokens": 29700337.0,
"reward": 1.477759376168251,
"reward_std": 0.3300577197223902,
"rewards/accuracy_reward": 0.2410714291036129,
"rewards/cosine_scaled_reward": 0.2400361318141222,
"rewards/format_reward": 0.9966517761349678,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 41.593751430511475,
"epoch": 0.41044776119402987,
"grad_norm": 6.706897258758545,
"learning_rate": 9.670621810259594e-07,
"loss": 0.107,
"num_tokens": 29859917.0,
"reward": 1.6255246251821518,
"reward_std": 0.3502213731408119,
"rewards/accuracy_reward": 0.31473214365541935,
"rewards/cosine_scaled_reward": 0.3141406271606684,
"rewards/format_reward": 0.9966517761349678,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 36.96540403366089,
"epoch": 0.417910447761194,
"grad_norm": 3.161987781524658,
"learning_rate": 9.64696040412104e-07,
"loss": 0.1082,
"num_tokens": 30023526.0,
"reward": 1.4321366250514984,
"reward_std": 0.3272698614746332,
"rewards/accuracy_reward": 0.21986606623977423,
"rewards/cosine_scaled_reward": 0.21896691620349884,
"rewards/format_reward": 0.9933035746216774,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 28.55915331840515,
"epoch": 0.4253731343283582,
"grad_norm": 2.9937174320220947,
"learning_rate": 9.62250935917808e-07,
"loss": 0.0479,
"num_tokens": 30176771.0,
"reward": 1.4093515276908875,
"reward_std": 0.22921365313231945,
"rewards/accuracy_reward": 0.20535714086145163,
"rewards/cosine_scaled_reward": 0.20511038601398468,
"rewards/format_reward": 0.9988839253783226,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 26.62834930419922,
"epoch": 0.43283582089552236,
"grad_norm": 3.9241106510162354,
"learning_rate": 9.597272830295876e-07,
"loss": 0.035,
"num_tokens": 30335774.0,
"reward": 1.479707032442093,
"reward_std": 0.24030436016619205,
"rewards/accuracy_reward": 0.24107142724096775,
"rewards/cosine_scaled_reward": 0.24086768366396427,
"rewards/format_reward": 0.9977678507566452,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 25.078126192092896,
"epoch": 0.44029850746268656,
"grad_norm": 3.5471527576446533,
"learning_rate": 9.57125510581363e-07,
"loss": 0.023,
"num_tokens": 30487948.0,
"reward": 1.6225911229848862,
"reward_std": 0.31288249231874943,
"rewards/accuracy_reward": 0.31361607275903225,
"rewards/cosine_scaled_reward": 0.31120707653462887,
"rewards/format_reward": 0.9977678507566452,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 24.590402603149414,
"epoch": 0.44776119402985076,
"grad_norm": 2.711862802505493,
"learning_rate": 9.5444606068159e-07,
"loss": 0.0366,
"num_tokens": 30641445.0,
"reward": 1.433879777789116,
"reward_std": 0.2141956863924861,
"rewards/accuracy_reward": 0.21763392444700003,
"rewards/cosine_scaled_reward": 0.21624577604234219,
"rewards/format_reward": 1.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 23.093750953674316,
"epoch": 0.4552238805970149,
"grad_norm": 2.6093637943267822,
"learning_rate": 9.516893886381321e-07,
"loss": 0.0178,
"num_tokens": 30803937.0,
"reward": 1.3692706674337387,
"reward_std": 0.1967663299292326,
"rewards/accuracy_reward": 0.1863839286379516,
"rewards/cosine_scaled_reward": 0.18623489327728748,
"rewards/format_reward": 0.9966517761349678,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 22.19084930419922,
"epoch": 0.4626865671641791,
"grad_norm": 2.131438732147217,
"learning_rate": 9.488559628808938e-07,
"loss": 0.0064,
"num_tokens": 30953740.0,
"reward": 1.422857090830803,
"reward_std": 0.20307728182524443,
"rewards/accuracy_reward": 0.21205356996506453,
"rewards/cosine_scaled_reward": 0.21191950421780348,
"rewards/format_reward": 0.9988839253783226,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 22.098215341567993,
"epoch": 0.4701492537313433,
"grad_norm": 2.0213825702667236,
"learning_rate": 9.459462648822207e-07,
"loss": 0.0076,
"num_tokens": 31112844.0,
"reward": 1.4250895529985428,
"reward_std": 0.2019376672008093,
"rewards/accuracy_reward": 0.2131696455180645,
"rewards/cosine_scaled_reward": 0.21303593553602695,
"rewards/format_reward": 0.9988839253783226,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 22.00111722946167,
"epoch": 0.47761194029850745,
"grad_norm": 2.22880220413208,
"learning_rate": 9.429607890750862e-07,
"loss": 0.0049,
"num_tokens": 31274853.0,
"reward": 1.5980830639600754,
"reward_std": 0.21290546283125877,
"rewards/accuracy_reward": 0.29910714365541935,
"rewards/cosine_scaled_reward": 0.29897585324943066,
"rewards/format_reward": 1.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 21.63169765472412,
"epoch": 0.48507462686567165,
"grad_norm": 2.368590831756592,
"learning_rate": 9.399000427690734e-07,
"loss": 0.007,
"num_tokens": 31425235.0,
"reward": 1.475319281220436,
"reward_std": 0.19503124710172415,
"rewards/accuracy_reward": 0.2388392835855484,
"rewards/cosine_scaled_reward": 0.2375959688797593,
"rewards/format_reward": 0.9988839253783226,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 21.521206617355347,
"epoch": 0.4925373134328358,
"grad_norm": 1.893410325050354,
"learning_rate": 9.367645460641714e-07,
"loss": 0.0017,
"num_tokens": 31578222.0,
"reward": 1.4797853082418442,
"reward_std": 0.20832678768783808,
"rewards/accuracy_reward": 0.23995535541325808,
"rewards/cosine_scaled_reward": 0.23982988391071558,
"rewards/format_reward": 1.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 21.70647430419922,
"epoch": 0.5,
"grad_norm": 2.0883381366729736,
"learning_rate": 9.335548317623956e-07,
"loss": 0.002,
"num_tokens": 31726287.0,
"reward": 1.573533684015274,
"reward_std": 0.2080208584666252,
"rewards/accuracy_reward": 0.2868303582072258,
"rewards/cosine_scaled_reward": 0.28670324943959713,
"rewards/format_reward": 1.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 21.202009916305542,
"epoch": 0.5074626865671642,
"grad_norm": 2.551394462585449,
"learning_rate": 9.302714452772514e-07,
"loss": 0.0052,
"num_tokens": 31876548.0,
"reward": 1.4675123244524002,
"reward_std": 0.1835378697142005,
"rewards/accuracy_reward": 0.2343750037252903,
"rewards/cosine_scaled_reward": 0.23425334133207798,
"rewards/format_reward": 0.9988839253783226,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 21.535715103149414,
"epoch": 0.5149253731343284,
"grad_norm": 2.6030776500701904,
"learning_rate": 9.269149445410544e-07,
"loss": 0.0046,
"num_tokens": 32020812.0,
"reward": 1.5433960407972336,
"reward_std": 0.20367479603737593,
"rewards/accuracy_reward": 0.2723214318975806,
"rewards/cosine_scaled_reward": 0.2721905801445246,
"rewards/format_reward": 0.9988839253783226,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 20.599331617355347,
"epoch": 0.5223880597014925,
"grad_norm": 2.546696424484253,
"learning_rate": 9.23485899910123e-07,
"loss": 0.0003,
"num_tokens": 32169237.0,
"reward": 1.4831460118293762,
"reward_std": 0.17036813125014305,
"rewards/accuracy_reward": 0.24029876478016376,
"rewards/cosine_scaled_reward": 0.2465388011187315,
"rewards/format_reward": 0.9988839253783226,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 20.520090103149414,
"epoch": 0.5298507462686567,
"grad_norm": 2.1121726036071777,
"learning_rate": 9.199848940678605e-07,
"loss": 0.0011,
"num_tokens": 32314543.0,
"reward": 1.5132792741060257,
"reward_std": 0.1777313705533743,
"rewards/accuracy_reward": 0.25669643096625805,
"rewards/cosine_scaled_reward": 0.25658278726041317,
"rewards/format_reward": 1.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 21.00334882736206,
"epoch": 0.5373134328358209,
"grad_norm": 3.4874420166015625,
"learning_rate": 9.164125219257417e-07,
"loss": 0.0042,
"num_tokens": 32464130.0,
"reward": 1.4764407873153687,
"reward_std": 0.17472796607762575,
"rewards/accuracy_reward": 0.23883928544819355,
"rewards/cosine_scaled_reward": 0.23871749639511108,
"rewards/format_reward": 0.9988839253783226,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 20.36049222946167,
"epoch": 0.5447761194029851,
"grad_norm": 2.506866931915283,
"learning_rate": 9.127693905222223e-07,
"loss": -0.0017,
"num_tokens": 32614685.0,
"reward": 1.508817195892334,
"reward_std": 0.1598520427942276,
"rewards/accuracy_reward": 0.2555803544819355,
"rewards/cosine_scaled_reward": 0.2554689049720764,
"rewards/format_reward": 0.9977678507566452,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 20.82366180419922,
"epoch": 0.5522388059701493,
"grad_norm": 2.803783893585205,
"learning_rate": 9.090561189195869e-07,
"loss": 0.0032,
"num_tokens": 32764399.0,
"reward": 1.5980816781520844,
"reward_std": 0.1671485211700201,
"rewards/accuracy_reward": 0.29910713993012905,
"rewards/cosine_scaled_reward": 0.2989744786173105,
"rewards/format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 20.162947416305542,
"epoch": 0.5597014925373134,
"grad_norm": 2.7349565029144287,
"learning_rate": 9.052733380987554e-07,
"loss": -0.0034,
"num_tokens": 32921881.0,
"reward": 1.5400699526071548,
"reward_std": 0.15323490625996783,
"rewards/accuracy_reward": 0.2700892873108387,
"rewards/cosine_scaled_reward": 0.26998060569167137,
"rewards/format_reward": 1.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 20.48549175262451,
"epoch": 0.5671641791044776,
"grad_norm": 2.8505167961120605,
"learning_rate": 9.014216908520618e-07,
"loss": 0.0031,
"num_tokens": 33073868.0,
"reward": 1.4675205424427986,
"reward_std": 0.18623241062249463,
"rewards/accuracy_reward": 0.23437499813735485,
"rewards/cosine_scaled_reward": 0.23426154493154172,
"rewards/format_reward": 0.9988839253783226,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 20.43861675262451,
"epoch": 0.5746268656716418,
"grad_norm": 2.716010332107544,
"learning_rate": 8.975018316740277e-07,
"loss": 0.0014,
"num_tokens": 33228253.0,
"reward": 1.3793513923883438,
"reward_std": 0.13768241831448336,
"rewards/accuracy_reward": 0.18973214644938707,
"rewards/cosine_scaled_reward": 0.1896191742271185,
"rewards/format_reward": 1.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 20.793527603149414,
"epoch": 0.582089552238806,
"grad_norm": 2.40360164642334,
"learning_rate": 8.935144266501468e-07,
"loss": 0.0032,
"num_tokens": 33381900.0,
"reward": 1.5244351625442505,
"reward_std": 0.11904470889763274,
"rewards/accuracy_reward": 0.26227678917348385,
"rewards/cosine_scaled_reward": 0.2621582942083478,
"rewards/format_reward": 1.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 20.420759677886963,
"epoch": 0.5895522388059702,
"grad_norm": 1.7719311714172363,
"learning_rate": 8.894601533436998e-07,
"loss": 0.0,
"num_tokens": 33529069.0,
"reward": 1.4775669872760773,
"reward_std": 0.08695766101230618,
"rewards/accuracy_reward": 0.238839291036129,
"rewards/cosine_scaled_reward": 0.23872762825340033,
"rewards/format_reward": 1.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 21.217634916305542,
"epoch": 0.5970149253731343,
"grad_norm": 2.443143129348755,
"learning_rate": 8.853397006806181e-07,
"loss": 0.0005,
"num_tokens": 33688720.0,
"reward": 1.513253703713417,
"reward_std": 0.07582757750845559,
"rewards/accuracy_reward": 0.25669642724096775,
"rewards/cosine_scaled_reward": 0.25655714236199856,
"rewards/format_reward": 1.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 20.36495614051819,
"epoch": 0.6044776119402985,
"grad_norm": 2.328760862350464,
"learning_rate": 8.811537688324187e-07,
"loss": 0.0005,
"num_tokens": 33841447.0,
"reward": 1.5244421511888504,
"reward_std": 0.14594503585249186,
"rewards/accuracy_reward": 0.2622767873108387,
"rewards/cosine_scaled_reward": 0.2621652837842703,
"rewards/format_reward": 1.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 20.29017925262451,
"epoch": 0.6119402985074627,
"grad_norm": 2.725046396255493,
"learning_rate": 8.769030690972261e-07,
"loss": 0.001,
"num_tokens": 33983067.0,
"reward": 1.6338183134794235,
"reward_std": 0.10190565621059022,
"rewards/accuracy_reward": 0.31696428544819355,
"rewards/cosine_scaled_reward": 0.31685397773981094,
"rewards/format_reward": 1.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 20.447545528411865,
"epoch": 0.6194029850746269,
"grad_norm": 2.60213565826416,
"learning_rate": 8.725883237789044e-07,
"loss": 0.0015,
"num_tokens": 34142804.0,
"reward": 1.578012928366661,
"reward_std": 0.13737911762410704,
"rewards/accuracy_reward": 0.28906249813735485,
"rewards/cosine_scaled_reward": 0.2889503873884678,
"rewards/format_reward": 1.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 20.629465103149414,
"epoch": 0.6268656716417911,
"grad_norm": 3.175710439682007,
"learning_rate": 8.682102660643195e-07,
"loss": 0.0005,
"num_tokens": 34285752.0,
"reward": 1.6293497383594513,
"reward_std": 0.1396320709803831,
"rewards/accuracy_reward": 0.3147321417927742,
"rewards/cosine_scaled_reward": 0.31461753230541945,
"rewards/format_reward": 1.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 20.609375715255737,
"epoch": 0.6343283582089553,
"grad_norm": 1.994175672531128,
"learning_rate": 8.637696398987515e-07,
"loss": 0.0006,
"num_tokens": 34437546.0,
"reward": 1.4998853504657745,
"reward_std": 0.11160349007695913,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": 0.2498853299766779,
"rewards/format_reward": 1.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 20.772322177886963,
"epoch": 0.6417910447761194,
"grad_norm": 2.2707390785217285,
"learning_rate": 8.592671998594793e-07,
"loss": 0.0014,
"num_tokens": 34590166.0,
"reward": 1.569079726934433,
"reward_std": 0.12746261023711725,
"rewards/accuracy_reward": 0.2845982136204839,
"rewards/cosine_scaled_reward": 0.2844814406707883,
"rewards/format_reward": 1.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 20.42522406578064,
"epoch": 0.6492537313432836,
"grad_norm": 3.1332757472991943,
"learning_rate": 8.547037110275579e-07,
"loss": 0.0003,
"num_tokens": 34732587.0,
"reward": 1.5824772864580154,
"reward_std": 0.17397390864789486,
"rewards/accuracy_reward": 0.29129463993012905,
"rewards/cosine_scaled_reward": 0.2911826092749834,
"rewards/format_reward": 1.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 20.521206378936768,
"epoch": 0.6567164179104478,
"grad_norm": 3.331784248352051,
"learning_rate": 8.500799488578119e-07,
"loss": 0.0018,
"num_tokens": 34886118.0,
"reward": 1.517743095755577,
"reward_std": 0.10318425773594697,
"rewards/accuracy_reward": 0.25892857275903225,
"rewards/cosine_scaled_reward": 0.2588145062327385,
"rewards/format_reward": 1.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 20.195313453674316,
"epoch": 0.664179104477612,
"grad_norm": 2.983870506286621,
"learning_rate": 8.453966990470656e-07,
"loss": 0.0017,
"num_tokens": 35035173.0,
"reward": 1.4329265505075455,
"reward_std": 0.1654082857307344,
"rewards/accuracy_reward": 0.21651785587891936,
"rewards/cosine_scaled_reward": 0.21640866296365857,
"rewards/format_reward": 1.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 20.505581378936768,
"epoch": 0.6716417910447762,
"grad_norm": 2.7624764442443848,
"learning_rate": 8.406547574006324e-07,
"loss": 0.0028,
"num_tokens": 35170154.0,
"reward": 1.6784569025039673,
"reward_std": 0.11971815738125713,
"rewards/accuracy_reward": 0.33928571827709675,
"rewards/cosine_scaled_reward": 0.339171065017581,
"rewards/format_reward": 1.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 20.358259677886963,
"epoch": 0.6791044776119403,
"grad_norm": 3.5564069747924805,
"learning_rate": 8.358549296970875e-07,
"loss": 0.0004,
"num_tokens": 35322683.0,
"reward": 1.4887285381555557,
"reward_std": 0.10889045795626373,
"rewards/accuracy_reward": 0.24441964458674192,
"rewards/cosine_scaled_reward": 0.24430878367275,
"rewards/format_reward": 1.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 20.440849542617798,
"epoch": 0.6865671641791045,
"grad_norm": 3.2373881340026855,
"learning_rate": 8.309980315513442e-07,
"loss": 0.0002,
"num_tokens": 35471790.0,
"reward": 1.7141735553741455,
"reward_std": 0.15256303502246737,
"rewards/accuracy_reward": 0.3571428544819355,
"rewards/cosine_scaled_reward": 0.3570306524634361,
"rewards/format_reward": 1.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 20.72433114051819,
"epoch": 0.6940298507462687,
"grad_norm": 3.330997943878174,
"learning_rate": 8.260848882760615e-07,
"loss": -0.0002,
"num_tokens": 35625503.0,
"reward": 1.480910375714302,
"reward_std": 0.10137590842316513,
"rewards/accuracy_reward": 0.2410714291036129,
"rewards/cosine_scaled_reward": 0.24095493368804455,
"rewards/format_reward": 0.9988839253783226,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 20.22991156578064,
"epoch": 0.7014925373134329,
"grad_norm": 2.4767699241638184,
"learning_rate": 8.211163347414003e-07,
"loss": -0.0,
"num_tokens": 35774893.0,
"reward": 1.667300522327423,
"reward_std": 0.13715945463627577,
"rewards/accuracy_reward": 0.3337053582072258,
"rewards/cosine_scaled_reward": 0.3335950942710042,
"rewards/format_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 20.570313692092896,
"epoch": 0.7089552238805971,
"grad_norm": 2.710425615310669,
"learning_rate": 8.160932152331586e-07,
"loss": 0.001,
"num_tokens": 35919596.0,
"reward": 1.5222073197364807,
"reward_std": 0.11986963993861366,
"rewards/accuracy_reward": 0.2611607192084193,
"rewards/cosine_scaled_reward": 0.2610465129837394,
"rewards/format_reward": 1.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 20.17745590209961,
"epoch": 0.7164179104477612,
"grad_norm": 2.4527230262756348,
"learning_rate": 8.110163833093049e-07,
"loss": -0.0014,
"num_tokens": 36073515.0,
"reward": 1.4708732217550278,
"reward_std": 0.1094957971945405,
"rewards/accuracy_reward": 0.23549107182770967,
"rewards/cosine_scaled_reward": 0.2353821201249957,
"rewards/format_reward": 1.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 20.28459882736206,
"epoch": 0.7238805970149254,
"grad_norm": 3.3163673877716064,
"learning_rate": 8.058867016549371e-07,
"loss": -0.0004,
"num_tokens": 36224698.0,
"reward": 1.5244434028863907,
"reward_std": 0.12670162599533796,
"rewards/accuracy_reward": 0.26227678544819355,
"rewards/cosine_scaled_reward": 0.2621665708720684,
"rewards/format_reward": 1.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 20.32366132736206,
"epoch": 0.7313432835820896,
"grad_norm": 2.859325408935547,
"learning_rate": 8.007050419356898e-07,
"loss": -0.0038,
"num_tokens": 36379460.0,
"reward": 1.5936386585235596,
"reward_std": 0.11910764441277877,
"rewards/accuracy_reward": 0.2968750074505806,
"rewards/cosine_scaled_reward": 0.2967636212706566,
"rewards/format_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 20.453126192092896,
"epoch": 0.7388059701492538,
"grad_norm": 2.248077154159546,
"learning_rate": 7.954722846496149e-07,
"loss": 0.003,
"num_tokens": 36535562.0,
"reward": 1.5244402140378952,
"reward_std": 0.08259574370241296,
"rewards/accuracy_reward": 0.2622767873108387,
"rewards/cosine_scaled_reward": 0.2621633689850569,
"rewards/format_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 20.748884916305542,
"epoch": 0.746268656716418,
"grad_norm": 2.1548383235931396,
"learning_rate": 7.901893189775639e-07,
"loss": 0.0007,
"num_tokens": 36690505.0,
"reward": 1.4797910004854202,
"reward_std": 0.08582926816011138,
"rewards/accuracy_reward": 0.23995536100119352,
"rewards/cosine_scaled_reward": 0.23983560875058174,
"rewards/format_reward": 1.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 20.24776864051819,
"epoch": 0.753731343283582,
"grad_norm": 2.7886292934417725,
"learning_rate": 7.848570426320916e-07,
"loss": 0.0015,
"num_tokens": 36832751.0,
"reward": 1.6460942327976227,
"reward_std": 0.10873846150510502,
"rewards/accuracy_reward": 0.32477678917348385,
"rewards/cosine_scaled_reward": 0.3246655622497201,
"rewards/format_reward": 0.9966517835855484,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 20.510045528411865,
"epoch": 0.7611940298507462,
"grad_norm": 2.756274700164795,
"learning_rate": 7.794763617049123e-07,
"loss": 0.0005,
"num_tokens": 36983200.0,
"reward": 1.7454221993684769,
"reward_std": 0.10062572493555422,
"rewards/accuracy_reward": 0.3727678582072258,
"rewards/cosine_scaled_reward": 0.372654240578413,
"rewards/format_reward": 1.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 20.693081378936768,
"epoch": 0.7686567164179104,
"grad_norm": 1.6583493947982788,
"learning_rate": 7.740481905129306e-07,
"loss": 0.0009,
"num_tokens": 37135357.0,
"reward": 1.5065791308879852,
"reward_std": 0.045692659896090504,
"rewards/accuracy_reward": 0.2533482136204839,
"rewards/cosine_scaled_reward": 0.2532308688387275,
"rewards/format_reward": 1.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 20.45424199104309,
"epoch": 0.7761194029850746,
"grad_norm": 3.4718310832977295,
"learning_rate": 7.685734514428766e-07,
"loss": 0.0004,
"num_tokens": 37286476.0,
"reward": 1.4396189004182816,
"reward_std": 0.07011978597014945,
"rewards/accuracy_reward": 0.21986607369035482,
"rewards/cosine_scaled_reward": 0.2197527764365077,
"rewards/format_reward": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 20.142858266830444,
"epoch": 0.7835820895522388,
"grad_norm": 3.465782642364502,
"learning_rate": 7.630530747945672e-07,
"loss": -0.0001,
"num_tokens": 37435684.0,
"reward": 1.4943107217550278,
"reward_std": 0.08559985571561413,
"rewards/accuracy_reward": 0.2477678619325161,
"rewards/cosine_scaled_reward": 0.2476588897407055,
"rewards/format_reward": 0.9988839253783226,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 20.049107789993286,
"epoch": 0.7910447761194029,
"grad_norm": 2.42425274848938,
"learning_rate": 7.574879986228244e-07,
"loss": -0.0001,
"num_tokens": 37582544.0,
"reward": 1.5132858008146286,
"reward_std": 0.08311572534432088,
"rewards/accuracy_reward": 0.2566964318975806,
"rewards/cosine_scaled_reward": 0.25658932141959667,
"rewards/format_reward": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 20.716518878936768,
"epoch": 0.7985074626865671,
"grad_norm": 2.004718065261841,
"learning_rate": 7.518791685780768e-07,
"loss": 0.0029,
"num_tokens": 37739602.0,
"reward": 1.6248817294836044,
"reward_std": 0.07741166379830844,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.3123816456645727,
"rewards/format_reward": 1.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 20.168527364730835,
"epoch": 0.8059701492537313,
"grad_norm": 3.211866855621338,
"learning_rate": 7.462275377456669e-07,
"loss": 0.0001,
"num_tokens": 37891401.0,
"reward": 1.5891772359609604,
"reward_std": 0.09964832732346451,
"rewards/accuracy_reward": 0.2946428610011935,
"rewards/cosine_scaled_reward": 0.2945342995226383,
"rewards/format_reward": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 20.299107789993286,
"epoch": 0.8134328358208955,
"grad_norm": 2.558931589126587,
"learning_rate": 7.405340664838993e-07,
"loss": 0.0004,
"num_tokens": 38042885.0,
"reward": 1.4619427621364594,
"reward_std": 0.11446394885876998,
"rewards/accuracy_reward": 0.23102678544819355,
"rewards/cosine_scaled_reward": 0.23091593850404024,
"rewards/format_reward": 1.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 20.55022406578064,
"epoch": 0.8208955223880597,
"grad_norm": 2.110745906829834,
"learning_rate": 7.347997222608492e-07,
"loss": -0.0002,
"num_tokens": 38197242.0,
"reward": 1.5735462754964828,
"reward_std": 0.0649347297767946,
"rewards/accuracy_reward": 0.2868303544819355,
"rewards/cosine_scaled_reward": 0.2867158204317093,
"rewards/format_reward": 1.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 20.551340103149414,
"epoch": 0.8283582089552238,
"grad_norm": 1.8552799224853516,
"learning_rate": 7.290254794899664e-07,
"loss": 0.002,
"num_tokens": 38340640.0,
"reward": 1.5255557298660278,
"reward_std": 0.06177972303260404,
"rewards/accuracy_reward": 0.26339286006987095,
"rewards/cosine_scaled_reward": 0.26327887177467346,
"rewards/format_reward": 0.9988839253783226,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 20.752233266830444,
"epoch": 0.835820895522388,
"grad_norm": 2.4142839908599854,
"learning_rate": 7.232123193644956e-07,
"loss": 0.0006,
"num_tokens": 38500346.0,
"reward": 1.51997210085392,
"reward_std": 0.07079227790242726,
"rewards/accuracy_reward": 0.2600446445867419,
"rewards/cosine_scaled_reward": 0.25992743112146854,
"rewards/format_reward": 1.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 20.882813215255737,
"epoch": 0.8432835820895522,
"grad_norm": 2.0007951259613037,
"learning_rate": 7.173612296907472e-07,
"loss": 0.0001,
"num_tokens": 38658729.0,
"reward": 1.6516644805669785,
"reward_std": 0.10580981522798538,
"rewards/accuracy_reward": 0.3258928544819355,
"rewards/cosine_scaled_reward": 0.3257715832442045,
"rewards/format_reward": 1.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 20.75558114051819,
"epoch": 0.8507462686567164,
"grad_norm": 2.6902952194213867,
"learning_rate": 7.114732047202432e-07,
"loss": -0.0002,
"num_tokens": 38805462.0,
"reward": 1.6114880591630936,
"reward_std": 0.09198851990785784,
"rewards/accuracy_reward": 0.3058035708963871,
"rewards/cosine_scaled_reward": 0.3056844547390938,
"rewards/format_reward": 1.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 20.13616156578064,
"epoch": 0.8582089552238806,
"grad_norm": 1.9690958261489868,
"learning_rate": 7.055492449807683e-07,
"loss": 0.0005,
"num_tokens": 38959272.0,
"reward": 1.5489989072084427,
"reward_std": 0.08409377404399265,
"rewards/accuracy_reward": 0.27455357648432255,
"rewards/cosine_scaled_reward": 0.27444526366889477,
"rewards/format_reward": 1.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 20.392857789993286,
"epoch": 0.8656716417910447,
"grad_norm": 1.6137839555740356,
"learning_rate": 6.99590357106354e-07,
"loss": 0.0002,
"num_tokens": 39103456.0,
"reward": 1.5356025993824005,
"reward_std": 0.06981676115469782,
"rewards/accuracy_reward": 0.2678571445867419,
"rewards/cosine_scaled_reward": 0.2677453998476267,
"rewards/format_reward": 1.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 20.395090103149414,
"epoch": 0.8731343283582089,
"grad_norm": 3.123340129852295,
"learning_rate": 6.935975536662253e-07,
"loss": 0.0011,
"num_tokens": 39248058.0,
"reward": 1.589174211025238,
"reward_std": 0.12407711929557763,
"rewards/accuracy_reward": 0.29464285634458065,
"rewards/cosine_scaled_reward": 0.29453128995373845,
"rewards/format_reward": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 20.720983028411865,
"epoch": 0.8805970149253731,
"grad_norm": 2.0955867767333984,
"learning_rate": 6.875718529927404e-07,
"loss": 0.0006,
"num_tokens": 39410136.0,
"reward": 1.5802399963140488,
"reward_std": 0.0863498275235628,
"rewards/accuracy_reward": 0.2901785708963871,
"rewards/cosine_scaled_reward": 0.2900614067912102,
"rewards/format_reward": 1.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 20.71651864051819,
"epoch": 0.8880597014925373,
"grad_norm": 3.44924259185791,
"learning_rate": 6.815142790083473e-07,
"loss": 0.0025,
"num_tokens": 39569234.0,
"reward": 1.7007768154144287,
"reward_std": 0.14646728224154515,
"rewards/accuracy_reward": 0.3504464291036129,
"rewards/cosine_scaled_reward": 0.35033031180500984,
"rewards/format_reward": 1.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 20.483259916305542,
"epoch": 0.8955223880597015,
"grad_norm": 2.0056583881378174,
"learning_rate": 6.754258610515948e-07,
"loss": 0.0015,
"num_tokens": 39727235.0,
"reward": 1.4820292592048645,
"reward_std": 0.07951601898218996,
"rewards/accuracy_reward": 0.24107143096625805,
"rewards/cosine_scaled_reward": 0.24095771089196205,
"rewards/format_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 20.08147430419922,
"epoch": 0.9029850746268657,
"grad_norm": 2.79129958152771,
"learning_rate": 6.69307633702221e-07,
"loss": 0.0001,
"num_tokens": 39874236.0,
"reward": 1.5199817568063736,
"reward_std": 0.08146786268110873,
"rewards/accuracy_reward": 0.2600446455180645,
"rewards/cosine_scaled_reward": 0.2599370051175356,
"rewards/format_reward": 1.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 20.085938453674316,
"epoch": 0.9104477611940298,
"grad_norm": 3.3468284606933594,
"learning_rate": 6.631606366053506e-07,
"loss": -0.0003,
"num_tokens": 40015729.0,
"reward": 1.7141781598329544,
"reward_std": 0.10137949584012773,
"rewards/accuracy_reward": 0.3571428582072258,
"rewards/cosine_scaled_reward": 0.35703518986701965,
"rewards/format_reward": 1.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 20.622768878936768,
"epoch": 0.917910447761194,
"grad_norm": 2.6946330070495605,
"learning_rate": 6.569859142948327e-07,
"loss": 0.0021,
"num_tokens": 40163039.0,
"reward": 1.5467600226402283,
"reward_std": 0.11038771457970142,
"rewards/accuracy_reward": 0.27343749813735485,
"rewards/cosine_scaled_reward": 0.2733224518597126,
"rewards/format_reward": 1.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 20.418527841567993,
"epoch": 0.9253731343283582,
"grad_norm": 2.7771787643432617,
"learning_rate": 6.507845160157475e-07,
"loss": 0.0004,
"num_tokens": 40317998.0,
"reward": 1.6159598380327225,
"reward_std": 0.07643294239515797,
"rewards/accuracy_reward": 0.30803571827709675,
"rewards/cosine_scaled_reward": 0.30792406760156155,
"rewards/format_reward": 1.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 20.400670289993286,
"epoch": 0.9328358208955224,
"grad_norm": 1.899020791053772,
"learning_rate": 6.445574955461133e-07,
"loss": -0.0025,
"num_tokens": 40465605.0,
"reward": 1.4998881071805954,
"reward_std": 0.10077201342210174,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.2498880298808217,
"rewards/format_reward": 1.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 20.99330425262451,
"epoch": 0.9402985074626866,
"grad_norm": 1.1476165056228638,
"learning_rate": 6.383059110178203e-07,
"loss": 0.0009,
"num_tokens": 40617031.0,
"reward": 1.6114817261695862,
"reward_std": 0.0364510658172037,
"rewards/accuracy_reward": 0.30580357275903225,
"rewards/cosine_scaled_reward": 0.3056781152263284,
"rewards/format_reward": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 20.44642949104309,
"epoch": 0.9477611940298507,
"grad_norm": 2.6259467601776123,
"learning_rate": 6.320308247368284e-07,
"loss": 0.0001,
"num_tokens": 40772791.0,
"reward": 1.5289026349782944,
"reward_std": 0.13789613312110305,
"rewards/accuracy_reward": 0.2645089328289032,
"rewards/cosine_scaled_reward": 0.2643936015665531,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 20.164063453674316,
"epoch": 0.9552238805970149,
"grad_norm": 1.8769416809082031,
"learning_rate": 6.257333030026538e-07,
"loss": -0.0007,
"num_tokens": 40921778.0,
"reward": 1.5445343106985092,
"reward_std": 0.07823521745721607,
"rewards/accuracy_reward": 0.2723214318975806,
"rewards/cosine_scaled_reward": 0.2722127726301551,
"rewards/format_reward": 1.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 20.060268878936768,
"epoch": 0.9626865671641791,
"grad_norm": 3.188025951385498,
"learning_rate": 6.194144159271755e-07,
"loss": 0.0006,
"num_tokens": 41068368.0,
"reward": 1.4440890699625015,
"reward_std": 0.07432721156590105,
"rewards/accuracy_reward": 0.22209821362048388,
"rewards/cosine_scaled_reward": 0.22199082095175982,
"rewards/format_reward": 1.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 20.361608028411865,
"epoch": 0.9701492537313433,
"grad_norm": 1.6484222412109375,
"learning_rate": 6.130752372527981e-07,
"loss": 0.0019,
"num_tokens": 41215084.0,
"reward": 1.4931926876306534,
"reward_std": 0.051030852994102816,
"rewards/accuracy_reward": 0.246651791036129,
"rewards/cosine_scaled_reward": 0.24654078483581543,
"rewards/format_reward": 1.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 20.842634677886963,
"epoch": 0.9776119402985075,
"grad_norm": 3.736074447631836,
"learning_rate": 6.067168441699927e-07,
"loss": 0.0003,
"num_tokens": 41368055.0,
"reward": 1.6940742880105972,
"reward_std": 0.12490348052233458,
"rewards/accuracy_reward": 0.3470982164144516,
"rewards/cosine_scaled_reward": 0.3469760064035654,
"rewards/format_reward": 1.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 20.293527603149414,
"epoch": 0.9850746268656716,
"grad_norm": 11.65377140045166,
"learning_rate": 6.003403171342562e-07,
"loss": -0.0001,
"num_tokens": 41521014.0,
"reward": 1.388282224535942,
"reward_std": 0.06252689357782515,
"rewards/accuracy_reward": 0.19419642724096775,
"rewards/cosine_scaled_reward": 0.19408572791144252,
"rewards/format_reward": 1.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 20.136138439178467,
"epoch": 0.9925373134328358,
"grad_norm": 3.632760763168335,
"learning_rate": 5.939467396825136e-07,
"loss": 0.0005,
"num_tokens": 41671398.0,
"reward": 1.4574763923883438,
"reward_std": 0.08762641241588653,
"rewards/accuracy_reward": 0.2287946417927742,
"rewards/cosine_scaled_reward": 0.22868163883686066,
"rewards/format_reward": 1.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 20.29799175262451,
"epoch": 1.007462686567164,
"grad_norm": 5.531825542449951,
"learning_rate": 5.875371982489958e-07,
"loss": 0.0008,
"num_tokens": 41808033.0,
"reward": 1.5132822692394257,
"reward_std": 0.07973260305406171,
"rewards/accuracy_reward": 0.2566964328289032,
"rewards/cosine_scaled_reward": 0.2565857656300068,
"rewards/format_reward": 1.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 21.155134916305542,
"epoch": 1.0149253731343284,
"grad_norm": 4.794013023376465,
"learning_rate": 5.811127819806276e-07,
"loss": -0.0001,
"num_tokens": 41952708.0,
"reward": 1.6561159640550613,
"reward_std": 0.10551196637798199,
"rewards/accuracy_reward": 0.32812500186264515,
"rewards/cosine_scaled_reward": 0.32799087278544903,
"rewards/format_reward": 1.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 20.325893878936768,
"epoch": 1.0223880597014925,
"grad_norm": 4.288095474243164,
"learning_rate": 5.746745825519538e-07,
"loss": -0.0005,
"num_tokens": 42096168.0,
"reward": 1.6539065688848495,
"reward_std": 0.06463327458860135,
"rewards/accuracy_reward": 0.32700893096625805,
"rewards/cosine_scaled_reward": 0.3268975578248501,
"rewards/format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 20.46428680419922,
"epoch": 1.0298507462686568,
"grad_norm": 2.075472593307495,
"learning_rate": 5.682236939796336e-07,
"loss": 0.0006,
"num_tokens": 42245792.0,
"reward": 1.682920902967453,
"reward_std": 0.11724273651551442,
"rewards/accuracy_reward": 0.34151786006987095,
"rewards/cosine_scaled_reward": 0.34140297770500183,
"rewards/format_reward": 1.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 20.25334930419922,
"epoch": 1.037313432835821,
"grad_norm": 3.787569046020508,
"learning_rate": 5.61761212436541e-07,
"loss": 0.0007,
"num_tokens": 42387403.0,
"reward": 1.5601582527160645,
"reward_std": 0.09724155126728817,
"rewards/accuracy_reward": 0.2801339253783226,
"rewards/cosine_scaled_reward": 0.28002420626580715,
"rewards/format_reward": 1.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 20.424108266830444,
"epoch": 1.044776119402985,
"grad_norm": 2.1570165157318115,
"learning_rate": 5.552882360654949e-07,
"loss": 0.0008,
"num_tokens": 42536287.0,
"reward": 1.627119928598404,
"reward_std": 0.10236127915219129,
"rewards/accuracy_reward": 0.313616075553,
"rewards/cosine_scaled_reward": 0.3135037589818239,
"rewards/format_reward": 1.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 20.625000953674316,
"epoch": 1.0522388059701493,
"grad_norm": 3.6223268508911133,
"learning_rate": 5.488058647926577e-07,
"loss": 0.001,
"num_tokens": 42696151.0,
"reward": 1.535599023103714,
"reward_std": 0.09071048016893712,
"rewards/accuracy_reward": 0.2654533013701439,
"rewards/cosine_scaled_reward": 0.27220611833035946,
"rewards/format_reward": 1.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 20.46428632736206,
"epoch": 1.0597014925373134,
"grad_norm": 3.7596476078033447,
"learning_rate": 5.423152001406282e-07,
"loss": -0.0003,
"num_tokens": 42855279.0,
"reward": 1.6070294231176376,
"reward_std": 0.12858991045504808,
"rewards/accuracy_reward": 0.30357142724096775,
"rewards/cosine_scaled_reward": 0.3034578887745738,
"rewards/format_reward": 1.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 20.022322177886963,
"epoch": 1.0671641791044777,
"grad_norm": 3.700883150100708,
"learning_rate": 5.358173450412648e-07,
"loss": -0.0001,
"num_tokens": 43008275.0,
"reward": 1.5623932778835297,
"reward_std": 0.1167864422913425,
"rewards/accuracy_reward": 0.2812500009313226,
"rewards/cosine_scaled_reward": 0.28114316891878843,
"rewards/format_reward": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 20.39955425262451,
"epoch": 1.0746268656716418,
"grad_norm": 4.979684829711914,
"learning_rate": 5.293134036482698e-07,
"loss": 0.0,
"num_tokens": 43159225.0,
"reward": 1.5132808834314346,
"reward_std": 0.09190170587856983,
"rewards/accuracy_reward": 0.2566964328289032,
"rewards/cosine_scaled_reward": 0.25658440589904785,
"rewards/format_reward": 1.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 20.123884677886963,
"epoch": 1.0820895522388059,
"grad_norm": 2.860994338989258,
"learning_rate": 5.228044811495631e-07,
"loss": -0.0003,
"num_tokens": 43309240.0,
"reward": 1.582481175661087,
"reward_std": 0.0821403276665933,
"rewards/accuracy_reward": 0.2912946417927742,
"rewards/cosine_scaled_reward": 0.29118647053837776,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 20.095982789993286,
"epoch": 1.0895522388059702,
"grad_norm": 3.0046682357788086,
"learning_rate": 5.162916835794843e-07,
"loss": 0.0002,
"num_tokens": 43458758.0,
"reward": 1.5556955337524414,
"reward_std": 0.07936285677858734,
"rewards/accuracy_reward": 0.2779017873108387,
"rewards/cosine_scaled_reward": 0.27779373340308666,
"rewards/format_reward": 1.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 20.234375476837158,
"epoch": 1.0970149253731343,
"grad_norm": 4.488862991333008,
"learning_rate": 5.09776117630847e-07,
"loss": 0.0004,
"num_tokens": 43608960.0,
"reward": 1.6360509097576141,
"reward_std": 0.07161617746324822,
"rewards/accuracy_reward": 0.3180803544819355,
"rewards/cosine_scaled_reward": 0.31797049194574356,
"rewards/format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 20.46651864051819,
"epoch": 1.1044776119402986,
"grad_norm": 1.4745782613754272,
"learning_rate": 5.032588904668851e-07,
"loss": -0.0036,
"num_tokens": 43754050.0,
"reward": 1.7454189360141754,
"reward_std": 0.05425459118813336,
"rewards/accuracy_reward": 0.37276786379516125,
"rewards/cosine_scaled_reward": 0.37265095487236977,
"rewards/format_reward": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 20.15736722946167,
"epoch": 1.1119402985074627,
"grad_norm": 4.071691513061523,
"learning_rate": 4.967411095331149e-07,
"loss": 0.0004,
"num_tokens": 43899775.0,
"reward": 1.6427484452724457,
"reward_std": 0.09672015977940873,
"rewards/accuracy_reward": 0.32142856903374195,
"rewards/cosine_scaled_reward": 0.32131983526051044,
"rewards/format_reward": 1.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 20.150670528411865,
"epoch": 1.1194029850746268,
"grad_norm": 5.170862197875977,
"learning_rate": 4.90223882369153e-07,
"loss": 0.001,
"num_tokens": 44050078.0,
"reward": 1.629356011748314,
"reward_std": 0.12038855330866483,
"rewards/accuracy_reward": 0.3147321455180645,
"rewards/cosine_scaled_reward": 0.3146238140761852,
"rewards/format_reward": 1.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 20.264509439468384,
"epoch": 1.126865671641791,
"grad_norm": 2.8727357387542725,
"learning_rate": 4.837083164205159e-07,
"loss": 0.0013,
"num_tokens": 44198707.0,
"reward": 1.5356042981147766,
"reward_std": 0.13752735047977538,
"rewards/accuracy_reward": 0.2678571455180645,
"rewards/cosine_scaled_reward": 0.26774709299206734,
"rewards/format_reward": 1.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 20.49330425262451,
"epoch": 1.1343283582089552,
"grad_norm": 2.9410152435302734,
"learning_rate": 4.77195518850437e-07,
"loss": -0.0017,
"num_tokens": 44350965.0,
"reward": 1.5467612594366074,
"reward_std": 0.08439550402343343,
"rewards/accuracy_reward": 0.27343750186264515,
"rewards/cosine_scaled_reward": 0.2733236690983176,
"rewards/format_reward": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 20.28459882736206,
"epoch": 1.1417910447761195,
"grad_norm": 5.266554832458496,
"learning_rate": 4.7068659635173025e-07,
"loss": -0.0006,
"num_tokens": 44496740.0,
"reward": 1.571318507194519,
"reward_std": 0.09055654217311826,
"rewards/accuracy_reward": 0.2857142873108387,
"rewards/cosine_scaled_reward": 0.2856041230261326,
"rewards/format_reward": 1.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 20.398438453674316,
"epoch": 1.1492537313432836,
"grad_norm": 2.222485303878784,
"learning_rate": 4.6418265495873516e-07,
"loss": -0.0004,
"num_tokens": 44656433.0,
"reward": 1.667298749089241,
"reward_std": 0.10092825663519989,
"rewards/accuracy_reward": 0.3337053582072258,
"rewards/cosine_scaled_reward": 0.3335932996124029,
"rewards/format_reward": 1.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 20.23995590209961,
"epoch": 1.1567164179104479,
"grad_norm": 2.372213840484619,
"learning_rate": 4.5768479985937194e-07,
"loss": 0.002,
"num_tokens": 44807600.0,
"reward": 1.624890297651291,
"reward_std": 0.13548944082560865,
"rewards/accuracy_reward": 0.31249999813735485,
"rewards/cosine_scaled_reward": 0.31239020079374313,
"rewards/format_reward": 1.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 20.25558114051819,
"epoch": 1.164179104477612,
"grad_norm": 1.6955794095993042,
"learning_rate": 4.511941352073424e-07,
"loss": 0.0005,
"num_tokens": 44957893.0,
"reward": 1.5780149698257446,
"reward_std": 0.07597658339989977,
"rewards/accuracy_reward": 0.28906250558793545,
"rewards/cosine_scaled_reward": 0.2889523971825838,
"rewards/format_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 20.380581378936768,
"epoch": 1.171641791044776,
"grad_norm": 2.024655342102051,
"learning_rate": 4.4471176393450515e-07,
"loss": 0.001,
"num_tokens": 45113066.0,
"reward": 1.4954225569963455,
"reward_std": 0.09183560762491538,
"rewards/accuracy_reward": 0.2477678582072258,
"rewards/cosine_scaled_reward": 0.24765462800860405,
"rewards/format_reward": 1.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 19.982143878936768,
"epoch": 1.1791044776119404,
"grad_norm": 3.3944594860076904,
"learning_rate": 4.382387875634591e-07,
"loss": 0.0004,
"num_tokens": 45261530.0,
"reward": 1.6472150832414627,
"reward_std": 0.10724194766953588,
"rewards/accuracy_reward": 0.32366071455180645,
"rewards/cosine_scaled_reward": 0.32355429045856,
"rewards/format_reward": 1.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 20.106027603149414,
"epoch": 1.1865671641791045,
"grad_norm": 1.5790033340454102,
"learning_rate": 4.317763060203664e-07,
"loss": 0.0002,
"num_tokens": 45416145.0,
"reward": 1.4440883994102478,
"reward_std": 0.06350326852842159,
"rewards/accuracy_reward": 0.22209821734577417,
"rewards/cosine_scaled_reward": 0.2219901392236352,
"rewards/format_reward": 1.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 20.33705449104309,
"epoch": 1.1940298507462686,
"grad_norm": 1.6651524305343628,
"learning_rate": 4.253254174480462e-07,
"loss": -0.0,
"num_tokens": 45560303.0,
"reward": 1.5333705618977547,
"reward_std": 0.040959979950784486,
"rewards/accuracy_reward": 0.266741075553,
"rewards/cosine_scaled_reward": 0.2666294459568235,
"rewards/format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 20.408483028411865,
"epoch": 1.2014925373134329,
"grad_norm": 3.0133707523345947,
"learning_rate": 4.1888721801937226e-07,
"loss": 0.001,
"num_tokens": 45701981.0,
"reward": 1.5891735255718231,
"reward_std": 0.08890740286335586,
"rewards/accuracy_reward": 0.2946428610011935,
"rewards/cosine_scaled_reward": 0.2945305937901139,
"rewards/format_reward": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 20.158483266830444,
"epoch": 1.208955223880597,
"grad_norm": 2.2523109912872314,
"learning_rate": 4.124628017510042e-07,
"loss": -0.0028,
"num_tokens": 45851651.0,
"reward": 1.5891766995191574,
"reward_std": 0.08664929727092385,
"rewards/accuracy_reward": 0.2946428647264838,
"rewards/cosine_scaled_reward": 0.2945337858982384,
"rewards/format_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 20.29017925262451,
"epoch": 1.2164179104477613,
"grad_norm": 1.8585433959960938,
"learning_rate": 4.0605326031748646e-07,
"loss": -0.0001,
"num_tokens": 45994527.0,
"reward": 1.5445328205823898,
"reward_std": 0.05591056302149866,
"rewards/accuracy_reward": 0.27232143096625805,
"rewards/cosine_scaled_reward": 0.27221135422587395,
"rewards/format_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 20.445313453674316,
"epoch": 1.2238805970149254,
"grad_norm": 5.333817481994629,
"learning_rate": 3.9965968286574367e-07,
"loss": 0.0007,
"num_tokens": 46140630.0,
"reward": 1.4463159441947937,
"reward_std": 0.08372260938289244,
"rewards/accuracy_reward": 0.22321428637951612,
"rewards/cosine_scaled_reward": 0.2231016056612134,
"rewards/format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 20.148438215255737,
"epoch": 1.2313432835820897,
"grad_norm": 2.544990301132202,
"learning_rate": 3.9328315583000737e-07,
"loss": 0.0002,
"num_tokens": 46290443.0,
"reward": 1.636052206158638,
"reward_std": 0.09034244809299707,
"rewards/accuracy_reward": 0.3180803582072258,
"rewards/cosine_scaled_reward": 0.31797176599502563,
"rewards/format_reward": 1.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 20.42299175262451,
"epoch": 1.2388059701492538,
"grad_norm": 4.120884895324707,
"learning_rate": 3.869247627472021e-07,
"loss": 0.0018,
"num_tokens": 46439446.0,
"reward": 1.653905838727951,
"reward_std": 0.10355404989928729,
"rewards/accuracy_reward": 0.3270089291036129,
"rewards/cosine_scaled_reward": 0.32689686864614487,
"rewards/format_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 20.90736722946167,
"epoch": 1.2462686567164178,
"grad_norm": 6.220430374145508,
"learning_rate": 3.805855840728246e-07,
"loss": -0.0009,
"num_tokens": 46583603.0,
"reward": 1.5690757930278778,
"reward_std": 0.1022080342995082,
"rewards/accuracy_reward": 0.2845982126891613,
"rewards/cosine_scaled_reward": 0.2844774592667818,
"rewards/format_reward": 1.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 20.39955425262451,
"epoch": 1.2537313432835822,
"grad_norm": 3.827559232711792,
"learning_rate": 3.7426669699734626e-07,
"loss": 0.0,
"num_tokens": 46729329.0,
"reward": 1.5199765115976334,
"reward_std": 0.07372096207812717,
"rewards/accuracy_reward": 0.26004464365541935,
"rewards/cosine_scaled_reward": 0.2599317729473114,
"rewards/format_reward": 1.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 20.420759439468384,
"epoch": 1.2611940298507462,
"grad_norm": 2.8399739265441895,
"learning_rate": 3.679691752631715e-07,
"loss": 0.0012,
"num_tokens": 46881738.0,
"reward": 1.6873881220817566,
"reward_std": 0.028184092890906953,
"rewards/accuracy_reward": 0.34375000558793545,
"rewards/cosine_scaled_reward": 0.3436380457133055,
"rewards/format_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 20.460938453674316,
"epoch": 1.2686567164179103,
"grad_norm": 3.9981281757354736,
"learning_rate": 3.6169408898217966e-07,
"loss": -0.0007,
"num_tokens": 47032783.0,
"reward": 1.5735474079847336,
"reward_std": 0.09409430988146994,
"rewards/accuracy_reward": 0.28683035261929035,
"rewards/cosine_scaled_reward": 0.28671699203550816,
"rewards/format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 20.434152603149414,
"epoch": 1.2761194029850746,
"grad_norm": 4.026747226715088,
"learning_rate": 3.554425044538867e-07,
"loss": -0.0003,
"num_tokens": 47186252.0,
"reward": 1.6293520778417587,
"reward_std": 0.06463150960456687,
"rewards/accuracy_reward": 0.31473214738070965,
"rewards/cosine_scaled_reward": 0.31461989507079124,
"rewards/format_reward": 1.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 20.639509677886963,
"epoch": 1.2835820895522387,
"grad_norm": 2.835671901702881,
"learning_rate": 3.492154839842524e-07,
"loss": -0.001,
"num_tokens": 47337545.0,
"reward": 1.6829200685024261,
"reward_std": 0.11416030763536611,
"rewards/accuracy_reward": 0.34151785261929035,
"rewards/cosine_scaled_reward": 0.3414021451026201,
"rewards/format_reward": 1.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 20.23549175262451,
"epoch": 1.291044776119403,
"grad_norm": 2.3982043266296387,
"learning_rate": 3.430140857051674e-07,
"loss": 0.0006,
"num_tokens": 47485516.0,
"reward": 1.7543545216321945,
"reward_std": 0.08049214289215456,
"rewards/accuracy_reward": 0.3772321417927742,
"rewards/cosine_scaled_reward": 0.37712232768535614,
"rewards/format_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 20.290179014205933,
"epoch": 1.2985074626865671,
"grad_norm": 2.3194615840911865,
"learning_rate": 3.3683936339464955e-07,
"loss": -0.0009,
"num_tokens": 47626200.0,
"reward": 1.5512276887893677,
"reward_std": 0.05493424205126729,
"rewards/accuracy_reward": 0.2756696483120322,
"rewards/cosine_scaled_reward": 0.2755579724907875,
"rewards/format_reward": 1.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 20.627233028411865,
"epoch": 1.3059701492537314,
"grad_norm": 1.7407325506210327,
"learning_rate": 3.3069236629777884e-07,
"loss": 0.0019,
"num_tokens": 47773482.0,
"reward": 1.7253312766551971,
"reward_std": 0.04794870165083864,
"rewards/accuracy_reward": 0.3627232201397419,
"rewards/cosine_scaled_reward": 0.36260795034468174,
"rewards/format_reward": 1.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 20.26339364051819,
"epoch": 1.3134328358208955,
"grad_norm": 1.944441556930542,
"learning_rate": 3.2457413894840514e-07,
"loss": 0.0005,
"num_tokens": 47925630.0,
"reward": 1.542300522327423,
"reward_std": 0.04922672476845946,
"rewards/accuracy_reward": 0.2712053554132581,
"rewards/cosine_scaled_reward": 0.2710951156914234,
"rewards/format_reward": 1.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 20.04799175262451,
"epoch": 1.3208955223880596,
"grad_norm": 2.3487837314605713,
"learning_rate": 3.184857209916528e-07,
"loss": 0.0009,
"num_tokens": 48071681.0,
"reward": 1.5333750247955322,
"reward_std": 0.08439638444930608,
"rewards/accuracy_reward": 0.26674107275903225,
"rewards/cosine_scaled_reward": 0.2666339073330164,
"rewards/format_reward": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 20.16294765472412,
"epoch": 1.328358208955224,
"grad_norm": 2.5152955055236816,
"learning_rate": 3.124281470072597e-07,
"loss": 0.0007,
"num_tokens": 48219603.0,
"reward": 1.6070343852043152,
"reward_std": 0.07860209648621108,
"rewards/accuracy_reward": 0.3035714291036129,
"rewards/cosine_scaled_reward": 0.3034628815948963,
"rewards/format_reward": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 20.195312976837158,
"epoch": 1.335820895522388,
"grad_norm": 2.6069984436035156,
"learning_rate": 3.064024463337747e-07,
"loss": -0.0006,
"num_tokens": 48382522.0,
"reward": 1.7208726704120636,
"reward_std": 0.12136359186843038,
"rewards/accuracy_reward": 0.3604910708963871,
"rewards/cosine_scaled_reward": 0.3603815697133541,
"rewards/format_reward": 1.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 20.24330425262451,
"epoch": 1.3432835820895521,
"grad_norm": 3.2656219005584717,
"learning_rate": 3.004096428936461e-07,
"loss": -0.0005,
"num_tokens": 48544316.0,
"reward": 1.7164078652858734,
"reward_std": 0.12633045494445128,
"rewards/accuracy_reward": 0.3582589328289032,
"rewards/cosine_scaled_reward": 0.3581488821655512,
"rewards/format_reward": 1.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 20.943081378936768,
"epoch": 1.3507462686567164,
"grad_norm": 1.9599158763885498,
"learning_rate": 2.9445075501923176e-07,
"loss": 0.0005,
"num_tokens": 48698129.0,
"reward": 1.6896116733551025,
"reward_std": 0.055237259725728904,
"rewards/accuracy_reward": 0.3404017873108387,
"rewards/cosine_scaled_reward": 0.3492097966372967,
"rewards/format_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 20.816965103149414,
"epoch": 1.3582089552238805,
"grad_norm": 2.1912848949432373,
"learning_rate": 2.8852679527975685e-07,
"loss": 0.0008,
"num_tokens": 48846693.0,
"reward": 1.5489892959594727,
"reward_std": 0.07417789786538975,
"rewards/accuracy_reward": 0.2745535736903548,
"rewards/cosine_scaled_reward": 0.27443567011505365,
"rewards/format_reward": 1.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 20.80022382736206,
"epoch": 1.3656716417910448,
"grad_norm": 2.0120460987091064,
"learning_rate": 2.8263877030925277e-07,
"loss": -0.0003,
"num_tokens": 48993618.0,
"reward": 1.6047926098108292,
"reward_std": 0.08897415082109461,
"rewards/accuracy_reward": 0.3024553554132581,
"rewards/cosine_scaled_reward": 0.30233720503747463,
"rewards/format_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 20.600447416305542,
"epoch": 1.373134328358209,
"grad_norm": 2.198028087615967,
"learning_rate": 2.767876806355045e-07,
"loss": 0.0011,
"num_tokens": 49145180.0,
"reward": 1.5378303229808807,
"reward_std": 0.10318562714383006,
"rewards/accuracy_reward": 0.2689732126891613,
"rewards/cosine_scaled_reward": 0.2688570562750101,
"rewards/format_reward": 1.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 20.30580472946167,
"epoch": 1.3805970149253732,
"grad_norm": 3.135545015335083,
"learning_rate": 2.709745205100337e-07,
"loss": 0.0007,
"num_tokens": 49288630.0,
"reward": 1.6762290000915527,
"reward_std": 0.10257845791056752,
"rewards/accuracy_reward": 0.3381696464493871,
"rewards/cosine_scaled_reward": 0.3380592940375209,
"rewards/format_reward": 1.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 20.150670051574707,
"epoch": 1.3880597014925373,
"grad_norm": 2.3859987258911133,
"learning_rate": 2.652002777391507e-07,
"loss": 0.0003,
"num_tokens": 49430613.0,
"reward": 1.7298022359609604,
"reward_std": 0.09394080052152276,
"rewards/accuracy_reward": 0.36495535634458065,
"rewards/cosine_scaled_reward": 0.36484682094305754,
"rewards/format_reward": 1.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 20.50892972946167,
"epoch": 1.3955223880597014,
"grad_norm": 2.2975528240203857,
"learning_rate": 2.594659335161008e-07,
"loss": -0.0025,
"num_tokens": 49577965.0,
"reward": 1.555689975619316,
"reward_std": 0.07875558780506253,
"rewards/accuracy_reward": 0.2779017873108387,
"rewards/cosine_scaled_reward": 0.2777881510555744,
"rewards/format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 20.405134916305542,
"epoch": 1.4029850746268657,
"grad_norm": 1.598307490348816,
"learning_rate": 2.5377246225433304e-07,
"loss": -0.0001,
"num_tokens": 49731048.0,
"reward": 1.6873885244131088,
"reward_std": 0.061549990693965384,
"rewards/accuracy_reward": 0.34375,
"rewards/cosine_scaled_reward": 0.3436384294182062,
"rewards/format_reward": 1.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 20.441964864730835,
"epoch": 1.4104477611940298,
"grad_norm": 1.098617434501648,
"learning_rate": 2.4812083142192323e-07,
"loss": -0.0004,
"num_tokens": 49875452.0,
"reward": 1.6940844804048538,
"reward_std": 0.03013577858569505,
"rewards/accuracy_reward": 0.34709821455180645,
"rewards/cosine_scaled_reward": 0.34698620066046715,
"rewards/format_reward": 1.0,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 20.527902841567993,
"epoch": 1.417910447761194,
"grad_norm": 2.504321336746216,
"learning_rate": 2.4251200137717543e-07,
"loss": 0.0003,
"num_tokens": 50048789.0,
"reward": 1.6360465586185455,
"reward_std": 0.08695389210011228,
"rewards/accuracy_reward": 0.3180803582072258,
"rewards/cosine_scaled_reward": 0.3179661240428686,
"rewards/format_reward": 1.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 20.17299175262451,
"epoch": 1.4253731343283582,
"grad_norm": 2.5839359760284424,
"learning_rate": 2.3694692520543292e-07,
"loss": 0.0013,
"num_tokens": 50197200.0,
"reward": 1.6204270422458649,
"reward_std": 0.07372181725033755,
"rewards/accuracy_reward": 0.31026786006987095,
"rewards/cosine_scaled_reward": 0.31015911884605885,
"rewards/format_reward": 1.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 20.678572416305542,
"epoch": 1.4328358208955223,
"grad_norm": 2.182985544204712,
"learning_rate": 2.314265485571235e-07,
"loss": 0.0091,
"num_tokens": 50350120.0,
"reward": 1.66729336977005,
"reward_std": 0.08215013663391346,
"rewards/accuracy_reward": 0.3337053582072258,
"rewards/cosine_scaled_reward": 0.3335879575461149,
"rewards/format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 20.617188692092896,
"epoch": 1.4402985074626866,
"grad_norm": 1.8787038326263428,
"learning_rate": 2.2595180948706926e-07,
"loss": 0.0005,
"num_tokens": 50507097.0,
"reward": 1.6829163581132889,
"reward_std": 0.03772871130308175,
"rewards/accuracy_reward": 0.34151786006987095,
"rewards/cosine_scaled_reward": 0.34139839746057987,
"rewards/format_reward": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 20.189732551574707,
"epoch": 1.4477611940298507,
"grad_norm": 1.7796034812927246,
"learning_rate": 2.2052363829508776e-07,
"loss": -0.0002,
"num_tokens": 50667171.0,
"reward": 1.7521222680807114,
"reward_std": 0.05102925866069086,
"rewards/accuracy_reward": 0.37611608020961285,
"rewards/cosine_scaled_reward": 0.37600609846413136,
"rewards/format_reward": 1.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 20.324777603149414,
"epoch": 1.455223880597015,
"grad_norm": 2.2849700450897217,
"learning_rate": 2.1514295736790838e-07,
"loss": 0.0008,
"num_tokens": 50814222.0,
"reward": 1.735380157828331,
"reward_std": 0.09453902203552644,
"rewards/accuracy_reward": 0.36830357275903225,
"rewards/cosine_scaled_reward": 0.3681926131248474,
"rewards/format_reward": 0.9988839253783226,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 20.19084906578064,
"epoch": 1.462686567164179,
"grad_norm": 3.1908223628997803,
"learning_rate": 2.0981068102243616e-07,
"loss": 0.0004,
"num_tokens": 50957049.0,
"reward": 1.6472126096487045,
"reward_std": 0.09634861818715024,
"rewards/accuracy_reward": 0.32366071455180645,
"rewards/cosine_scaled_reward": 0.323551825247705,
"rewards/format_reward": 1.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 20.42299199104309,
"epoch": 1.4701492537313432,
"grad_norm": 1.6977643966674805,
"learning_rate": 2.0452771535038515e-07,
"loss": 0.0007,
"num_tokens": 51099484.0,
"reward": 1.54006627202034,
"reward_std": 0.043066854786879105,
"rewards/accuracy_reward": 0.27008928917348385,
"rewards/cosine_scaled_reward": 0.26997692696750164,
"rewards/format_reward": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 20.802456378936768,
"epoch": 1.4776119402985075,
"grad_norm": 2.020225763320923,
"learning_rate": 1.9929495806431023e-07,
"loss": 0.0003,
"num_tokens": 51261563.0,
"reward": 1.6829185336828232,
"reward_std": 0.058621840249692525,
"rewards/accuracy_reward": 0.3415178619325161,
"rewards/cosine_scaled_reward": 0.3414005693048239,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 21.07924199104309,
"epoch": 1.4850746268656716,
"grad_norm": 1.6952190399169922,
"learning_rate": 1.9411329834506286e-07,
"loss": 0.0012,
"num_tokens": 51404026.0,
"reward": 1.4038956314325333,
"reward_std": 0.053959765473592824,
"rewards/accuracy_reward": 0.2020089328289032,
"rewards/cosine_scaled_reward": 0.20188664738088846,
"rewards/format_reward": 1.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 20.199777841567993,
"epoch": 1.4925373134328357,
"grad_norm": 2.445509433746338,
"learning_rate": 1.8898361669069497e-07,
"loss": -0.0013,
"num_tokens": 51564021.0,
"reward": 1.640516072511673,
"reward_std": 0.09235968008678697,
"rewards/accuracy_reward": 0.32031249813735485,
"rewards/cosine_scaled_reward": 0.3202035166323185,
"rewards/format_reward": 1.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 20.577009916305542,
"epoch": 1.5,
"grad_norm": 1.4149043560028076,
"learning_rate": 1.8390678476684142e-07,
"loss": 0.0007,
"num_tokens": 51712674.0,
"reward": 1.5110464841127396,
"reward_std": 0.038402879612469576,
"rewards/accuracy_reward": 0.2555803582072258,
"rewards/cosine_scaled_reward": 0.2554660662135575,
"rewards/format_reward": 1.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 20.31361699104309,
"epoch": 1.5074626865671643,
"grad_norm": 1.3769646883010864,
"learning_rate": 1.7888366525859967e-07,
"loss": 0.0003,
"num_tokens": 51862699.0,
"reward": 1.7298002988100052,
"reward_std": 0.03547355129772001,
"rewards/accuracy_reward": 0.3649553656578064,
"rewards/cosine_scaled_reward": 0.3648448847234249,
"rewards/format_reward": 1.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 20.623884916305542,
"epoch": 1.5149253731343284,
"grad_norm": 3.1723361015319824,
"learning_rate": 1.7391511172393848e-07,
"loss": 0.0012,
"num_tokens": 52014514.0,
"reward": 1.5690792500972748,
"reward_std": 0.09754315220763488,
"rewards/accuracy_reward": 0.28459821082651615,
"rewards/cosine_scaled_reward": 0.2844809675589204,
"rewards/format_reward": 1.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 20.436384916305542,
"epoch": 1.5223880597014925,
"grad_norm": 1.6968096494674683,
"learning_rate": 1.690019684486557e-07,
"loss": -0.0003,
"num_tokens": 52161273.0,
"reward": 1.7119403928518295,
"reward_std": 0.031414411859074676,
"rewards/accuracy_reward": 0.35602678544819355,
"rewards/cosine_scaled_reward": 0.3559135627001524,
"rewards/format_reward": 1.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 20.014509677886963,
"epoch": 1.5298507462686568,
"grad_norm": 2.15055513381958,
"learning_rate": 1.6414507030291246e-07,
"loss": 0.0008,
"num_tokens": 52310342.0,
"reward": 1.7543574571609497,
"reward_std": 0.06756016856554936,
"rewards/accuracy_reward": 0.3772321417927742,
"rewards/cosine_scaled_reward": 0.3771252781152725,
"rewards/format_reward": 1.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 20.49553632736206,
"epoch": 1.537313432835821,
"grad_norm": 2.6926231384277344,
"learning_rate": 1.5934524259936753e-07,
"loss": -0.0012,
"num_tokens": 52466618.0,
"reward": 1.5155121833086014,
"reward_std": 0.1101713702082634,
"rewards/accuracy_reward": 0.2578125009313226,
"rewards/cosine_scaled_reward": 0.25769960321485996,
"rewards/format_reward": 1.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 20.606027364730835,
"epoch": 1.544776119402985,
"grad_norm": 2.4713823795318604,
"learning_rate": 1.5460330095293443e-07,
"loss": 0.0008,
"num_tokens": 52628361.0,
"reward": 1.767742782831192,
"reward_std": 0.07515440785066829,
"rewards/accuracy_reward": 0.38392857275903225,
"rewards/cosine_scaled_reward": 0.38381412625312805,
"rewards/format_reward": 1.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 20.731027603149414,
"epoch": 1.5522388059701493,
"grad_norm": 2.261600971221924,
"learning_rate": 1.4992005114218804e-07,
"loss": 0.0009,
"num_tokens": 52775840.0,
"reward": 1.6472041308879852,
"reward_std": 0.07109666805187231,
"rewards/accuracy_reward": 0.32366071827709675,
"rewards/cosine_scaled_reward": 0.32354335859417915,
"rewards/format_reward": 1.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 21.045759677886963,
"epoch": 1.5597014925373134,
"grad_norm": 0.7100464105606079,
"learning_rate": 1.4529628897244212e-07,
"loss": 0.0004,
"num_tokens": 52938273.0,
"reward": 1.6248737573623657,
"reward_std": 0.012628240511414646,
"rewards/accuracy_reward": 0.31250000838190317,
"rewards/cosine_scaled_reward": 0.31237365305423737,
"rewards/format_reward": 1.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 20.343751192092896,
"epoch": 1.5671641791044775,
"grad_norm": 1.3848381042480469,
"learning_rate": 1.4073280014052074e-07,
"loss": 0.0004,
"num_tokens": 53086589.0,
"reward": 1.736496239900589,
"reward_std": 0.02916058116019471,
"rewards/accuracy_reward": 0.36830357648432255,
"rewards/cosine_scaled_reward": 0.36819261126220226,
"rewards/format_reward": 1.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 20.465402841567993,
"epoch": 1.5746268656716418,
"grad_norm": 2.789355993270874,
"learning_rate": 1.3623036010124845e-07,
"loss": 0.0004,
"num_tokens": 53244486.0,
"reward": 1.7097086608409882,
"reward_std": 0.08943129004910588,
"rewards/accuracy_reward": 0.3549107173457742,
"rewards/cosine_scaled_reward": 0.35479787550866604,
"rewards/format_reward": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 20.30022406578064,
"epoch": 1.582089552238806,
"grad_norm": 2.885791778564453,
"learning_rate": 1.3178973393568056e-07,
"loss": 0.0001,
"num_tokens": 53394467.0,
"reward": 1.6762287318706512,
"reward_std": 0.08162061781850127,
"rewards/accuracy_reward": 0.33816964738070965,
"rewards/cosine_scaled_reward": 0.3380589783191681,
"rewards/format_reward": 1.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 20.32366132736206,
"epoch": 1.5895522388059702,
"grad_norm": 2.8630964756011963,
"learning_rate": 1.2741167622109555e-07,
"loss": 0.0,
"num_tokens": 53546981.0,
"reward": 1.6427462249994278,
"reward_std": 0.058320232714407894,
"rewards/accuracy_reward": 0.3214285708963871,
"rewards/cosine_scaled_reward": 0.3213176503777504,
"rewards/format_reward": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 20.03236675262451,
"epoch": 1.5970149253731343,
"grad_norm": 2.2839713096618652,
"learning_rate": 1.230969309027739e-07,
"loss": -0.001,
"num_tokens": 53698458.0,
"reward": 1.6181965470314026,
"reward_std": 0.07244142005220056,
"rewards/accuracy_reward": 0.30915178707800806,
"rewards/cosine_scaled_reward": 0.3090446996502578,
"rewards/format_reward": 1.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 20.234375953674316,
"epoch": 1.6044776119402986,
"grad_norm": 2.546302556991577,
"learning_rate": 1.1884623116758119e-07,
"loss": 0.0002,
"num_tokens": 53849116.0,
"reward": 1.651675522327423,
"reward_std": 0.0710948963102993,
"rewards/accuracy_reward": 0.32589285634458065,
"rewards/cosine_scaled_reward": 0.3257826119661331,
"rewards/format_reward": 1.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 20.366072177886963,
"epoch": 1.6119402985074627,
"grad_norm": 2.6176400184631348,
"learning_rate": 1.1466029931938181e-07,
"loss": 0.0007,
"num_tokens": 53999708.0,
"reward": 1.6025667041540146,
"reward_std": 0.055539907814715406,
"rewards/accuracy_reward": 0.30133928917348385,
"rewards/cosine_scaled_reward": 0.30122734420001507,
"rewards/format_reward": 1.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 20.61830449104309,
"epoch": 1.6194029850746268,
"grad_norm": 2.8905839920043945,
"learning_rate": 1.1053984665630023e-07,
"loss": 0.0013,
"num_tokens": 54147318.0,
"reward": 1.6003320217132568,
"reward_std": 0.12910877341846927,
"rewards/accuracy_reward": 0.30022321455180645,
"rewards/cosine_scaled_reward": 0.3001087475568056,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 20.37834930419922,
"epoch": 1.626865671641791,
"grad_norm": 2.013456106185913,
"learning_rate": 1.0648557334985308e-07,
"loss": 0.001,
"num_tokens": 54315337.0,
"reward": 1.3972100466489792,
"reward_std": 0.08131316915778086,
"rewards/accuracy_reward": 0.19866071827709675,
"rewards/cosine_scaled_reward": 0.1985492706298828,
"rewards/format_reward": 1.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 20.402902841567993,
"epoch": 1.6343283582089554,
"grad_norm": 2.0653622150421143,
"learning_rate": 1.024981683259723e-07,
"loss": 0.0005,
"num_tokens": 54458594.0,
"reward": 1.7476565390825272,
"reward_std": 0.05102954798443449,
"rewards/accuracy_reward": 0.3738839291036129,
"rewards/cosine_scaled_reward": 0.37377250753343105,
"rewards/format_reward": 1.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 20.71651840209961,
"epoch": 1.6417910447761193,
"grad_norm": 1.5992612838745117,
"learning_rate": 9.857830914793824e-08,
"loss": 0.0009,
"num_tokens": 54607180.0,
"reward": 1.7409540712833405,
"reward_std": 0.04907748210338525,
"rewards/accuracy_reward": 0.37053571827709675,
"rewards/cosine_scaled_reward": 0.37041825242340565,
"rewards/format_reward": 1.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 20.84709906578064,
"epoch": 1.6492537313432836,
"grad_norm": 1.980920672416687,
"learning_rate": 9.472666190124456e-08,
"loss": 0.0006,
"num_tokens": 54749371.0,
"reward": 1.746531069278717,
"reward_std": 0.06538078441796102,
"rewards/accuracy_reward": 0.37388392724096775,
"rewards/cosine_scaled_reward": 0.3737631347030401,
"rewards/format_reward": 0.9988839253783226,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 20.19642925262451,
"epoch": 1.6567164179104479,
"grad_norm": 2.4586503505706787,
"learning_rate": 9.094388108041301e-08,
"loss": -0.0002,
"num_tokens": 54894707.0,
"reward": 1.6717657148838043,
"reward_std": 0.07176896455235493,
"rewards/accuracy_reward": 0.3359375037252903,
"rewards/cosine_scaled_reward": 0.3358281459659338,
"rewards/format_reward": 1.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 19.99776864051819,
"epoch": 1.664179104477612,
"grad_norm": 1.5232560634613037,
"learning_rate": 8.723060947777777e-08,
"loss": 0.001,
"num_tokens": 55046777.0,
"reward": 1.4195362627506256,
"reward_std": 0.05862198262564533,
"rewards/accuracy_reward": 0.2098214307334274,
"rewards/cosine_scaled_reward": 0.2097147584427148,
"rewards/format_reward": 1.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 20.31808066368103,
"epoch": 1.671641791044776,
"grad_norm": 1.8125054836273193,
"learning_rate": 8.358747807425826e-08,
"loss": 0.0004,
"num_tokens": 55197894.0,
"reward": 1.6070322841405869,
"reward_std": 0.06688734842464328,
"rewards/accuracy_reward": 0.30357143096625805,
"rewards/cosine_scaled_reward": 0.3034607693552971,
"rewards/format_reward": 1.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 20.676340341567993,
"epoch": 1.6791044776119404,
"grad_norm": 2.117981195449829,
"learning_rate": 8.001510593213945e-08,
"loss": 0.0009,
"num_tokens": 55361956.0,
"reward": 1.5824733972549438,
"reward_std": 0.061247594597631405,
"rewards/accuracy_reward": 0.2912946464493871,
"rewards/cosine_scaled_reward": 0.2911787135526538,
"rewards/format_reward": 1.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 20.402902603149414,
"epoch": 1.6865671641791045,
"grad_norm": 2.322235584259033,
"learning_rate": 7.651410008987697e-08,
"loss": -0.0005,
"num_tokens": 55534853.0,
"reward": 1.5400669574737549,
"reward_std": 0.07680402030835864,
"rewards/accuracy_reward": 0.2700892873108387,
"rewards/cosine_scaled_reward": 0.26997758261859417,
"rewards/format_reward": 1.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 20.39732265472412,
"epoch": 1.6940298507462686,
"grad_norm": 2.2915542125701904,
"learning_rate": 7.308505545894566e-08,
"loss": 0.0015,
"num_tokens": 55689753.0,
"reward": 1.5065849125385284,
"reward_std": 0.05005305098056567,
"rewards/accuracy_reward": 0.2533482164144516,
"rewards/cosine_scaled_reward": 0.25323665514588356,
"rewards/format_reward": 1.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 20.694197177886963,
"epoch": 1.7014925373134329,
"grad_norm": 0.9342123866081238,
"learning_rate": 6.972855472274852e-08,
"loss": 0.0018,
"num_tokens": 55839255.0,
"reward": 1.5266692787408829,
"reward_std": 0.020895601193345215,
"rewards/accuracy_reward": 0.2633928619325161,
"rewards/cosine_scaled_reward": 0.2632763609290123,
"rewards/format_reward": 1.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 20.92299199104309,
"epoch": 1.7089552238805972,
"grad_norm": 1.9537469148635864,
"learning_rate": 6.644516823760437e-08,
"loss": 0.0,
"num_tokens": 55992818.0,
"reward": 1.6561282128095627,
"reward_std": 0.07515494169327752,
"rewards/accuracy_reward": 0.3281250037252903,
"rewards/cosine_scaled_reward": 0.3280031867325306,
"rewards/format_reward": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 20.45870614051819,
"epoch": 1.716417910447761,
"grad_norm": 2.364659547805786,
"learning_rate": 6.323545393582847e-08,
"loss": 0.0011,
"num_tokens": 56149261.0,
"reward": 1.718636766076088,
"reward_std": 0.12023507321784876,
"rewards/accuracy_reward": 0.35937500186264515,
"rewards/cosine_scaled_reward": 0.35926168598234653,
"rewards/format_reward": 1.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 20.56584858894348,
"epoch": 1.7238805970149254,
"grad_norm": 3.120504379272461,
"learning_rate": 6.009995723092653e-08,
"loss": -0.0001,
"num_tokens": 56305584.0,
"reward": 1.8458667993545532,
"reward_std": 0.08213974455068751,
"rewards/accuracy_reward": 0.42299107275903225,
"rewards/cosine_scaled_reward": 0.42287569493055344,
"rewards/format_reward": 1.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 20.895090103149414,
"epoch": 1.7313432835820897,
"grad_norm": 1.5434519052505493,
"learning_rate": 5.703921092491393e-08,
"loss": 0.0003,
"num_tokens": 56468698.0,
"reward": 1.6784496754407883,
"reward_std": 0.029160332879651918,
"rewards/accuracy_reward": 0.3392857192084193,
"rewards/cosine_scaled_reward": 0.3391639143228531,
"rewards/format_reward": 1.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 20.31584906578064,
"epoch": 1.7388059701492538,
"grad_norm": 2.2976887226104736,
"learning_rate": 5.405373511777939e-08,
"loss": 0.0003,
"num_tokens": 56622173.0,
"reward": 1.6114967614412308,
"reward_std": 0.07515410965470437,
"rewards/accuracy_reward": 0.30580357275903225,
"rewards/cosine_scaled_reward": 0.30569313652813435,
"rewards/format_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 20.062500715255737,
"epoch": 1.7462686567164178,
"grad_norm": 2.2692880630493164,
"learning_rate": 5.114403711910631e-08,
"loss": 0.0004,
"num_tokens": 56781293.0,
"reward": 1.6472149044275284,
"reward_std": 0.07417689614470646,
"rewards/accuracy_reward": 0.32194368727505207,
"rewards/cosine_scaled_reward": 0.328018419444561,
"rewards/format_reward": 1.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 20.42745614051819,
"epoch": 1.7537313432835822,
"grad_norm": 1.9118342399597168,
"learning_rate": 4.831061136186787e-08,
"loss": -0.0002,
"num_tokens": 56925372.0,
"reward": 1.6449773013591766,
"reward_std": 0.06771036455336343,
"rewards/accuracy_reward": 0.3225446417927742,
"rewards/cosine_scaled_reward": 0.322432579472661,
"rewards/format_reward": 1.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 20.910715103149414,
"epoch": 1.7611940298507462,
"grad_norm": 2.7811856269836426,
"learning_rate": 4.5553939318410004e-08,
"loss": 0.0011,
"num_tokens": 57072716.0,
"reward": 1.825772985816002,
"reward_std": 0.09235973202066816,
"rewards/accuracy_reward": 0.4129464291036129,
"rewards/cosine_scaled_reward": 0.4128264728933573,
"rewards/format_reward": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 20.27678656578064,
"epoch": 1.7686567164179103,
"grad_norm": 1.4767699241638184,
"learning_rate": 4.287448941863692e-08,
"loss": 0.0006,
"num_tokens": 57219380.0,
"reward": 1.6014523804187775,
"reward_std": 0.04930526966539617,
"rewards/accuracy_reward": 0.30133928917348385,
"rewards/cosine_scaled_reward": 0.30122908018529415,
"rewards/format_reward": 0.9988839253783226,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 20.325893878936768,
"epoch": 1.7761194029850746,
"grad_norm": 3.0593392848968506,
"learning_rate": 4.0272716970412516e-08,
"loss": -0.0001,
"num_tokens": 57371640.0,
"reward": 1.5713180601596832,
"reward_std": 0.124595548491925,
"rewards/accuracy_reward": 0.2857142901048064,
"rewards/cosine_scaled_reward": 0.28560364712029696,
"rewards/format_reward": 1.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 20.405134677886963,
"epoch": 1.783582089552239,
"grad_norm": 2.0308165550231934,
"learning_rate": 3.774906408219197e-08,
"loss": 0.0018,
"num_tokens": 57518011.0,
"reward": 1.6494415253400803,
"reward_std": 0.04959785374813919,
"rewards/accuracy_reward": 0.3247767873108387,
"rewards/cosine_scaled_reward": 0.32466465793550014,
"rewards/format_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 20.67745614051819,
"epoch": 1.7910447761194028,
"grad_norm": 1.7336244583129883,
"learning_rate": 3.5303959587895896e-08,
"loss": 0.0034,
"num_tokens": 57677018.0,
"reward": 1.591400220990181,
"reward_std": 0.03742815442538472,
"rewards/accuracy_reward": 0.29575893096625805,
"rewards/cosine_scaled_reward": 0.2956412099301815,
"rewards/format_reward": 1.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 20.30803656578064,
"epoch": 1.7985074626865671,
"grad_norm": 2.150383234024048,
"learning_rate": 3.293781897404063e-08,
"loss": -0.001,
"num_tokens": 57828678.0,
"reward": 1.6829252541065216,
"reward_std": 0.06816608617647546,
"rewards/accuracy_reward": 0.34151785634458065,
"rewards/cosine_scaled_reward": 0.34140734374523163,
"rewards/format_reward": 1.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 20.556920528411865,
"epoch": 1.8059701492537314,
"grad_norm": 1.195513367652893,
"learning_rate": 3.065104430913601e-08,
"loss": 0.0005,
"num_tokens": 57977305.0,
"reward": 1.6561355143785477,
"reward_std": 0.041787590547230025,
"rewards/accuracy_reward": 0.3281250074505806,
"rewards/cosine_scaled_reward": 0.328010406345129,
"rewards/format_reward": 1.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 20.666295289993286,
"epoch": 1.8134328358208955,
"grad_norm": 2.063978672027588,
"learning_rate": 2.8444024175363733e-08,
"loss": 0.0002,
"num_tokens": 58129758.0,
"reward": 1.6315756142139435,
"reward_std": 0.07628487978815457,
"rewards/accuracy_reward": 0.31584821455180645,
"rewards/cosine_scaled_reward": 0.31572734005749226,
"rewards/format_reward": 1.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 20.343751192092896,
"epoch": 1.8208955223880596,
"grad_norm": 2.139723777770996,
"learning_rate": 2.6317133602547335e-08,
"loss": -0.0,
"num_tokens": 58286354.0,
"reward": 1.6070320904254913,
"reward_std": 0.06395891746558391,
"rewards/accuracy_reward": 0.3035714291036129,
"rewards/cosine_scaled_reward": 0.30346059799194336,
"rewards/format_reward": 1.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 20.492188453674316,
"epoch": 1.828358208955224,
"grad_norm": 1.4515061378479004,
"learning_rate": 2.4270734004424643e-08,
"loss": 0.0004,
"num_tokens": 58436883.0,
"reward": 1.624885842204094,
"reward_std": 0.05441444956320396,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.3123858105391264,
"rewards/format_reward": 1.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 20.328125953674316,
"epoch": 1.835820895522388,
"grad_norm": 1.8083621263504028,
"learning_rate": 2.2305173117234233e-08,
"loss": -0.0003,
"num_tokens": 58590025.0,
"reward": 1.6539071798324585,
"reward_std": 0.06463310816476309,
"rewards/accuracy_reward": 0.32700893096625805,
"rewards/cosine_scaled_reward": 0.3268981762230396,
"rewards/format_reward": 1.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 20.55580449104309,
"epoch": 1.8432835820895521,
"grad_norm": 2.2442469596862793,
"learning_rate": 2.0420784940626156e-08,
"loss": 0.0013,
"num_tokens": 58742411.0,
"reward": 1.714171290397644,
"reward_std": 0.08214285858039716,
"rewards/accuracy_reward": 0.3571428544819355,
"rewards/cosine_scaled_reward": 0.35702834837138653,
"rewards/format_reward": 1.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 20.25111675262451,
"epoch": 1.8507462686567164,
"grad_norm": 2.635986804962158,
"learning_rate": 1.861788968090683e-08,
"loss": -0.0002,
"num_tokens": 58885124.0,
"reward": 1.6583720594644547,
"reward_std": 0.09363627548930253,
"rewards/accuracy_reward": 0.32924107275903225,
"rewards/cosine_scaled_reward": 0.32913094013929367,
"rewards/format_reward": 1.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 20.059152364730835,
"epoch": 1.8582089552238807,
"grad_norm": 2.7098944187164307,
"learning_rate": 1.68967936966275e-08,
"loss": 0.0005,
"num_tokens": 59042857.0,
"reward": 1.647214189171791,
"reward_std": 0.0797318636930413,
"rewards/accuracy_reward": 0.32366071455180645,
"rewards/cosine_scaled_reward": 0.3235534243285656,
"rewards/format_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 20.43861699104309,
"epoch": 1.8656716417910446,
"grad_norm": 2.948129177093506,
"learning_rate": 1.525778944652617e-08,
"loss": -0.0017,
"num_tokens": 59189866.0,
"reward": 1.6047987192869186,
"reward_std": 0.09995094314217567,
"rewards/accuracy_reward": 0.30245535261929035,
"rewards/cosine_scaled_reward": 0.30234329774975777,
"rewards/format_reward": 1.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 20.51116156578064,
"epoch": 1.873134328358209,
"grad_norm": 1.8753550052642822,
"learning_rate": 1.3701155439831248e-08,
"loss": 0.0003,
"num_tokens": 59351876.0,
"reward": 1.4842608720064163,
"reward_std": 0.05374052021700493,
"rewards/accuracy_reward": 0.24218750186264515,
"rewards/cosine_scaled_reward": 0.2420733030885458,
"rewards/format_reward": 1.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 19.960938692092896,
"epoch": 1.8805970149253732,
"grad_norm": 3.710313320159912,
"learning_rate": 1.222715618893555e-08,
"loss": -0.0011,
"num_tokens": 59501249.0,
"reward": 1.6673045605421066,
"reward_std": 0.11986687686294317,
"rewards/accuracy_reward": 0.33370536379516125,
"rewards/cosine_scaled_reward": 0.3335991408675909,
"rewards/format_reward": 1.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 20.74553632736206,
"epoch": 1.8880597014925373,
"grad_norm": 2.096618890762329,
"learning_rate": 1.0836042164448944e-08,
"loss": 0.0005,
"num_tokens": 59655845.0,
"reward": 1.5623822510242462,
"reward_std": 0.06011815097401296,
"rewards/accuracy_reward": 0.2812500037252903,
"rewards/cosine_scaled_reward": 0.28113218024373055,
"rewards/format_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 20.500000953674316,
"epoch": 1.8955223880597014,
"grad_norm": 2.3821330070495605,
"learning_rate": 9.528049752636714e-09,
"loss": 0.0004,
"num_tokens": 59810181.0,
"reward": 1.747654750943184,
"reward_std": 0.0755647381696889,
"rewards/accuracy_reward": 0.37611607275903225,
"rewards/cosine_scaled_reward": 0.37600288540124893,
"rewards/format_reward": 0.9955357164144516,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 20.908483266830444,
"epoch": 1.9029850746268657,
"grad_norm": 1.859230875968933,
"learning_rate": 8.303401215251581e-09,
"loss": 0.0002,
"num_tokens": 59954931.0,
"reward": 1.600327655673027,
"reward_std": 0.03937964968498875,
"rewards/accuracy_reward": 0.3002232201397419,
"rewards/cosine_scaled_reward": 0.3001043573021889,
"rewards/format_reward": 1.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 20.248884677886963,
"epoch": 1.9104477611940298,
"grad_norm": 2.67840313911438,
"learning_rate": 7.1623046517656495e-09,
"loss": 0.0004,
"num_tokens": 60097050.0,
"reward": 1.758818194270134,
"reward_std": 0.06981627906458954,
"rewards/accuracy_reward": 0.3794642873108387,
"rewards/cosine_scaled_reward": 0.37935382314026356,
"rewards/format_reward": 1.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 20.142858266830444,
"epoch": 1.917910447761194,
"grad_norm": 2.1021740436553955,
"learning_rate": 6.104953964008897e-09,
"loss": 0.0007,
"num_tokens": 60255522.0,
"reward": 1.5333734452724457,
"reward_std": 0.08875712241180622,
"rewards/accuracy_reward": 0.26674107648432255,
"rewards/cosine_scaled_reward": 0.26663233898580074,
"rewards/format_reward": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 20.85267949104309,
"epoch": 1.9253731343283582,
"grad_norm": 2.7934389114379883,
"learning_rate": 5.131528823220099e-09,
"loss": -0.0019,
"num_tokens": 60410742.0,
"reward": 1.670636236667633,
"reward_std": 0.08307532503371817,
"rewards/accuracy_reward": 0.3348214328289032,
"rewards/cosine_scaled_reward": 0.3358147069811821,
"rewards/format_reward": 1.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 20.453125953674316,
"epoch": 1.9328358208955225,
"grad_norm": 2.496358871459961,
"learning_rate": 4.242194639516416e-09,
"loss": 0.0009,
"num_tokens": 60571516.0,
"reward": 1.6360480785369873,
"reward_std": 0.07417896673651114,
"rewards/accuracy_reward": 0.31808036006987095,
"rewards/cosine_scaled_reward": 0.3179676216095686,
"rewards/format_reward": 1.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 20.14955425262451,
"epoch": 1.9402985074626866,
"grad_norm": 1.2594853639602661,
"learning_rate": 3.4371025337855407e-09,
"loss": 0.0003,
"num_tokens": 60722354.0,
"reward": 1.586945116519928,
"reward_std": 0.03156871721012067,
"rewards/accuracy_reward": 0.29352678975556046,
"rewards/cosine_scaled_reward": 0.29341828147880733,
"rewards/format_reward": 1.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 20.205358266830444,
"epoch": 1.9477611940298507,
"grad_norm": 1.8333399295806885,
"learning_rate": 2.7163893120066285e-09,
"loss": 0.0005,
"num_tokens": 60871290.0,
"reward": 1.5088191330432892,
"reward_std": 0.052982652708983835,
"rewards/accuracy_reward": 0.2544642798602581,
"rewards/cosine_scaled_reward": 0.25435481034219265,
"rewards/format_reward": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 20.30803656578064,
"epoch": 1.955223880597015,
"grad_norm": 2.2571942806243896,
"learning_rate": 2.080177442003117e-09,
"loss": -0.0004,
"num_tokens": 61015958.0,
"reward": 1.6137285828590393,
"reward_std": 0.1302357604727149,
"rewards/accuracy_reward": 0.3069196389988065,
"rewards/cosine_scaled_reward": 0.30680886935442686,
"rewards/format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 20.48214340209961,
"epoch": 1.962686567164179,
"grad_norm": 1.8233392238616943,
"learning_rate": 1.5285750326325953e-09,
"loss": 0.0001,
"num_tokens": 61160438.0,
"reward": 1.6360481083393097,
"reward_std": 0.04794773051276735,
"rewards/accuracy_reward": 0.31808035634458065,
"rewards/cosine_scaled_reward": 0.31796768493950367,
"rewards/format_reward": 1.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 21.101563215255737,
"epoch": 1.9701492537313432,
"grad_norm": 1.5878843069076538,
"learning_rate": 1.0616758154161631e-09,
"loss": 0.0006,
"num_tokens": 61316161.0,
"reward": 1.6092515885829926,
"reward_std": 0.0644803009436572,
"rewards/accuracy_reward": 0.30468749441206455,
"rewards/cosine_scaled_reward": 0.30456401966512203,
"rewards/format_reward": 1.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 20.38839364051819,
"epoch": 1.9776119402985075,
"grad_norm": 2.332639455795288,
"learning_rate": 6.795591286109514e-10,
"loss": -0.0004,
"num_tokens": 61460901.0,
"reward": 1.490959793329239,
"reward_std": 0.07921304133695628,
"rewards/accuracy_reward": 0.24553572060540318,
"rewards/cosine_scaled_reward": 0.24542399495840073,
"rewards/format_reward": 1.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 20.500001192092896,
"epoch": 1.9850746268656716,
"grad_norm": 1.9155348539352417,
"learning_rate": 3.8228990372862756e-10,
"loss": 0.0008,
"num_tokens": 61616757.0,
"reward": 1.624886617064476,
"reward_std": 0.07289814859302624,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.31238655652850866,
"rewards/format_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 20.14851450920105,
"epoch": 1.9925373134328357,
"grad_norm": 2.2550103664398193,
"learning_rate": 1.6991865450188825e-10,
"loss": 0.0002,
"num_tokens": 61771838.0,
"reward": 1.6784630566835403,
"reward_std": 0.06463183751365165,
"rewards/accuracy_reward": 0.3392857164144516,
"rewards/cosine_scaled_reward": 0.3391772899776697,
"rewards/format_reward": 1.0,
"step": 266
},
{
"epoch": 1.9925373134328357,
"step": 266,
"total_flos": 0.0,
"train_loss": 0.032786881700187384,
"train_runtime": 16819.5455,
"train_samples_per_second": 1.783,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 268,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}