{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9925373134328357, "eval_steps": 100, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 593.2823944091797, "epoch": 0.007462686567164179, "grad_norm": 0.5037462115287781, "learning_rate": 3.7037037037037036e-08, "loss": 0.2424, "num_tokens": 667405.0, "reward": 0.18871622439473867, "reward_std": 0.5178131051361561, "rewards/accuracy_reward": 0.13169642724096775, "rewards/cosine_scaled_reward": 0.00010013708379119635, "rewards/format_reward": 0.05691964435391128, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 629.1518173217773, "epoch": 0.014925373134328358, "grad_norm": 0.7031348943710327, "learning_rate": 7.407407407407407e-08, "loss": 0.2401, "num_tokens": 1365053.0, "reward": 0.20235019456595182, "reward_std": 0.5161089487373829, "rewards/accuracy_reward": 0.13616071455180645, "rewards/cosine_scaled_reward": -0.006355166085995734, "rewards/format_reward": 0.0725446434225887, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 579.0893096923828, "epoch": 0.022388059701492536, "grad_norm": 0.6601409316062927, "learning_rate": 1.111111111111111e-07, "loss": 0.2374, "num_tokens": 2014861.0, "reward": 0.2147554385010153, "reward_std": 0.5122785679996014, "rewards/accuracy_reward": 0.14174107275903225, "rewards/cosine_scaled_reward": 0.008282212191261351, "rewards/format_reward": 0.06473214365541935, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 565.4654350280762, "epoch": 0.029850746268656716, "grad_norm": 0.4544272720813751, "learning_rate": 1.4814814814814815e-07, "loss": 0.2732, "num_tokens": 2647950.0, "reward": 0.2221650118008256, "reward_std": 0.5312090590596199, "rewards/accuracy_reward": 0.14062500093132257, "rewards/cosine_scaled_reward": 0.01792393707728479, "rewards/format_reward": 0.06361607159487903, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 537.1294937133789, "epoch": 0.03731343283582089, "grad_norm": 0.4951060116291046, "learning_rate": 1.8518518518518516e-07, "loss": 0.2245, "num_tokens": 3266122.0, "reward": 0.2845571478828788, "reward_std": 0.5756800286471844, "rewards/accuracy_reward": 0.16183035727590322, "rewards/cosine_scaled_reward": 0.05464643065351993, "rewards/format_reward": 0.06808035844005644, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 595.4140930175781, "epoch": 0.04477611940298507, "grad_norm": 0.43293312191963196, "learning_rate": 2.222222222222222e-07, "loss": 0.2237, "num_tokens": 3928245.0, "reward": 0.22057450748980045, "reward_std": 0.5795701257884502, "rewards/accuracy_reward": 0.1395089291036129, "rewards/cosine_scaled_reward": 0.011869142268551514, "rewards/format_reward": 0.06919642933644354, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 557.8248062133789, "epoch": 0.05223880597014925, "grad_norm": 0.650478720664978, "learning_rate": 2.5925925925925923e-07, "loss": 0.2528, "num_tokens": 4550280.0, "reward": 0.23954601865261793, "reward_std": 0.5432833544909954, "rewards/accuracy_reward": 0.1372767877765, "rewards/cosine_scaled_reward": 0.01856386021245271, "rewards/format_reward": 0.08370535750873387, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 560.6506958007812, "epoch": 0.05970149253731343, "grad_norm": 0.46624094247817993, "learning_rate": 2.962962962962963e-07, "loss": 0.2368, "num_tokens": 5196015.0, "reward": 0.23683909513056278, "reward_std": 0.5061681233346462, "rewards/accuracy_reward": 0.15178571362048388, "rewards/cosine_scaled_reward": 0.024785520159639418, "rewards/format_reward": 0.06026785750873387, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 571.0123062133789, "epoch": 0.06716417910447761, "grad_norm": 0.533889889717102, "learning_rate": 3.333333333333333e-07, "loss": 0.2392, "num_tokens": 5835498.0, "reward": 0.23420938570052385, "reward_std": 0.5464016310870647, "rewards/accuracy_reward": 0.13839285681024194, "rewards/cosine_scaled_reward": 0.023271879297681153, "rewards/format_reward": 0.07254464412108064, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 573.4230194091797, "epoch": 0.07462686567164178, "grad_norm": 8.411685943603516, "learning_rate": 3.703703703703703e-07, "loss": 0.2067, "num_tokens": 6479685.0, "reward": 0.28206104040145874, "reward_std": 0.5646266750991344, "rewards/accuracy_reward": 0.16406249906867743, "rewards/cosine_scaled_reward": 0.03429317264817655, "rewards/format_reward": 0.08370535681024194, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 545.127254486084, "epoch": 0.08208955223880597, "grad_norm": 0.9542278051376343, "learning_rate": 4.0740740740740737e-07, "loss": 0.1322, "num_tokens": 7103751.0, "reward": 0.31405315548181534, "reward_std": 0.6051793843507767, "rewards/accuracy_reward": 0.1629464291036129, "rewards/cosine_scaled_reward": 0.04619600536534563, "rewards/format_reward": 0.10491071362048388, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 613.8381958007812, "epoch": 0.08955223880597014, "grad_norm": 0.5434120297431946, "learning_rate": 4.444444444444444e-07, "loss": 0.2048, "num_tokens": 7791062.0, "reward": 0.26040036062477157, "reward_std": 0.5332776308059692, "rewards/accuracy_reward": 0.14174107008147985, "rewards/cosine_scaled_reward": 0.015980710508301854, "rewards/format_reward": 0.10267857136204839, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 515.9442176818848, "epoch": 0.09701492537313433, "grad_norm": 0.5895915031433105, "learning_rate": 4.814814814814814e-07, "loss": 0.1288, "num_tokens": 8388340.0, "reward": 0.40466225892305374, "reward_std": 0.6555211395025253, "rewards/accuracy_reward": 0.1919642873108387, "rewards/cosine_scaled_reward": 0.07988545499392785, "rewards/format_reward": 0.1328124995343387, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 584.5747985839844, "epoch": 0.1044776119402985, "grad_norm": 1.4778566360473633, "learning_rate": 5.185185185185185e-07, "loss": 0.1507, "num_tokens": 9054239.0, "reward": 0.2996965404599905, "reward_std": 0.5809138379991055, "rewards/accuracy_reward": 0.11941964272409678, "rewards/cosine_scaled_reward": -0.00722311669960618, "rewards/format_reward": 0.1875, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 620.6741371154785, "epoch": 0.11194029850746269, "grad_norm": 2.612856149673462, "learning_rate": 5.555555555555555e-07, "loss": 0.2128, "num_tokens": 9741539.0, "reward": 0.34804879780858755, "reward_std": 0.6096060052514076, "rewards/accuracy_reward": 0.14062499906867743, "rewards/cosine_scaled_reward": 0.010995208285748959, "rewards/format_reward": 0.19642856996506453, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 603.7500228881836, "epoch": 0.11940298507462686, "grad_norm": 1.191655158996582, "learning_rate": 5.925925925925926e-07, "loss": 0.1818, "num_tokens": 10413043.0, "reward": 0.3713220842182636, "reward_std": 0.639706090092659, "rewards/accuracy_reward": 0.13950893003493547, "rewards/cosine_scaled_reward": 0.011947068211156875, "rewards/format_reward": 0.2198660708963871, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 544.5558242797852, "epoch": 0.12686567164179105, "grad_norm": 0.6938550472259521, "learning_rate": 6.296296296296296e-07, "loss": 0.085, "num_tokens": 11037421.0, "reward": 0.5226609222590923, "reward_std": 0.7315020114183426, "rewards/accuracy_reward": 0.17410713899880648, "rewards/cosine_scaled_reward": 0.05502696509938687, "rewards/format_reward": 0.2935267835855484, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 571.3337364196777, "epoch": 0.13432835820895522, "grad_norm": 1.5410027503967285, "learning_rate": 6.666666666666666e-07, "loss": 0.1118, "num_tokens": 11676216.0, "reward": 0.591816034168005, "reward_std": 0.7450486496090889, "rewards/accuracy_reward": 0.17633928451687098, "rewards/cosine_scaled_reward": 0.05163742566946894, "rewards/format_reward": 0.3638392873108387, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 549.0669898986816, "epoch": 0.1417910447761194, "grad_norm": 0.5230954885482788, "learning_rate": 7.037037037037037e-07, "loss": 0.0718, "num_tokens": 12316516.0, "reward": 0.7351889088749886, "reward_std": 0.7607561945915222, "rewards/accuracy_reward": 0.1893028812482953, "rewards/cosine_scaled_reward": 0.08228707825765014, "rewards/format_reward": 0.4654017835855484, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 553.9040451049805, "epoch": 0.14925373134328357, "grad_norm": 1.0374072790145874, "learning_rate": 7.407407407407406e-07, "loss": 0.0991, "num_tokens": 12945302.0, "reward": 0.6941376700997353, "reward_std": 0.730622187256813, "rewards/accuracy_reward": 0.13281249720603228, "rewards/cosine_scaled_reward": 0.027843003364978358, "rewards/format_reward": 0.5334821417927742, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 513.1774749755859, "epoch": 0.15671641791044777, "grad_norm": 0.46707218885421753, "learning_rate": 7.777777777777778e-07, "loss": 0.0643, "num_tokens": 13534109.0, "reward": 0.8666251823306084, "reward_std": 0.7560148313641548, "rewards/accuracy_reward": 0.1741071422584355, "rewards/cosine_scaled_reward": 0.07198226451873779, "rewards/format_reward": 0.6205357164144516, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 577.2232398986816, "epoch": 0.16417910447761194, "grad_norm": 0.7016672492027283, "learning_rate": 8.148148148148147e-07, "loss": 0.1127, "num_tokens": 14194037.0, "reward": 0.8435313403606415, "reward_std": 0.7020122557878494, "rewards/accuracy_reward": 0.14174107182770967, "rewards/cosine_scaled_reward": 0.02098666892379697, "rewards/format_reward": 0.6808035746216774, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 502.9531440734863, "epoch": 0.17164179104477612, "grad_norm": 0.35482147336006165, "learning_rate": 8.518518518518518e-07, "loss": 0.1307, "num_tokens": 14768411.0, "reward": 1.1029141992330551, "reward_std": 0.7098172605037689, "rewards/accuracy_reward": 0.22544642724096775, "rewards/cosine_scaled_reward": 0.11742306314408779, "rewards/format_reward": 0.7600446492433548, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 512.8035926818848, "epoch": 0.1791044776119403, "grad_norm": 0.32759323716163635, "learning_rate": 8.888888888888888e-07, "loss": 0.0876, "num_tokens": 15351427.0, "reward": 1.149243749678135, "reward_std": 0.7510530278086662, "rewards/accuracy_reward": 0.21205356903374195, "rewards/cosine_scaled_reward": 0.1124133332632482, "rewards/format_reward": 0.8247767984867096, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 535.4419860839844, "epoch": 0.1865671641791045, "grad_norm": 0.3772229850292206, "learning_rate": 9.259259259259259e-07, "loss": 0.1594, "num_tokens": 15974879.0, "reward": 1.1618424132466316, "reward_std": 0.6563375778496265, "rewards/accuracy_reward": 0.20535714086145163, "rewards/cosine_scaled_reward": 0.09934233513195068, "rewards/format_reward": 0.8571428582072258, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 557.8861923217773, "epoch": 0.19402985074626866, "grad_norm": 0.291864275932312, "learning_rate": 9.629629629629628e-07, "loss": 0.0932, "num_tokens": 16604401.0, "reward": 1.2150916159152985, "reward_std": 0.7205987647175789, "rewards/accuracy_reward": 0.22656249813735485, "rewards/cosine_scaled_reward": 0.12022545811487362, "rewards/format_reward": 0.8683035597205162, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 512.9129600524902, "epoch": 0.20149253731343283, "grad_norm": 0.31357285380363464, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 17195475.0, "reward": 1.3348890244960785, "reward_std": 0.6589159071445465, "rewards/accuracy_reward": 0.26897321455180645, "rewards/cosine_scaled_reward": 0.1685942793264985, "rewards/format_reward": 0.8973214253783226, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 532.2611846923828, "epoch": 0.208955223880597, "grad_norm": 0.3403704762458801, "learning_rate": 9.999575185316993e-07, "loss": 0.1619, "num_tokens": 17811437.0, "reward": 1.2805243134498596, "reward_std": 0.6447809338569641, "rewards/accuracy_reward": 0.24441963993012905, "rewards/cosine_scaled_reward": 0.12873857002705336, "rewards/format_reward": 0.9073660746216774, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 506.21988677978516, "epoch": 0.21641791044776118, "grad_norm": 0.35326462984085083, "learning_rate": 9.99830081345498e-07, "loss": 0.1134, "num_tokens": 18408722.0, "reward": 1.336455225944519, "reward_std": 0.6456731334328651, "rewards/accuracy_reward": 0.25000000186264515, "rewards/cosine_scaled_reward": 0.15565160103142262, "rewards/format_reward": 0.9308035746216774, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 503.8482475280762, "epoch": 0.22388059701492538, "grad_norm": 0.26988422870635986, "learning_rate": 9.996177100962712e-07, "loss": 0.0995, "num_tokens": 18986002.0, "reward": 1.4584019258618355, "reward_std": 0.6846916638314724, "rewards/accuracy_reward": 0.315848215483129, "rewards/cosine_scaled_reward": 0.21733042690902948, "rewards/format_reward": 0.9252232164144516, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 471.99778747558594, "epoch": 0.23134328358208955, "grad_norm": 0.3263660669326782, "learning_rate": 9.99320440871389e-07, "loss": 0.1279, "num_tokens": 19548200.0, "reward": 1.4953693896532059, "reward_std": 0.7131579741835594, "rewards/accuracy_reward": 0.32142856903374195, "rewards/cosine_scaled_reward": 0.2308604083955288, "rewards/format_reward": 0.9430803582072258, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 494.79801177978516, "epoch": 0.23880597014925373, "grad_norm": 0.32770803570747375, "learning_rate": 9.989383241845837e-07, "loss": 0.0804, "num_tokens": 20116083.0, "reward": 1.599047303199768, "reward_std": 0.754006952047348, "rewards/accuracy_reward": 0.3761160746216774, "rewards/cosine_scaled_reward": 0.28208293952047825, "rewards/format_reward": 0.9408482015132904, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 531.2064971923828, "epoch": 0.2462686567164179, "grad_norm": 0.27149882912635803, "learning_rate": 9.984714249673673e-07, "loss": 0.1024, "num_tokens": 20746676.0, "reward": 1.6310840100049973, "reward_std": 0.6816431954503059, "rewards/accuracy_reward": 0.3861607164144516, "rewards/cosine_scaled_reward": 0.27952144481241703, "rewards/format_reward": 0.9654017835855484, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 514.7946662902832, "epoch": 0.2537313432835821, "grad_norm": 0.27265796065330505, "learning_rate": 9.979198225579968e-07, "loss": 0.1376, "num_tokens": 21335188.0, "reward": 1.7159467786550522, "reward_std": 0.6568828374147415, "rewards/accuracy_reward": 0.4274553544819355, "rewards/cosine_scaled_reward": 0.3186252359300852, "rewards/format_reward": 0.9698660746216774, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 546.0234527587891, "epoch": 0.26119402985074625, "grad_norm": 0.2513149380683899, "learning_rate": 9.972836106879934e-07, "loss": 0.1169, "num_tokens": 21950753.0, "reward": 1.686128944158554, "reward_std": 0.6522306874394417, "rewards/accuracy_reward": 0.4196428582072258, "rewards/cosine_scaled_reward": 0.316709216684103, "rewards/format_reward": 0.9497767835855484, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 481.2410888671875, "epoch": 0.26865671641791045, "grad_norm": 0.2620702385902405, "learning_rate": 9.965628974662144e-07, "loss": 0.1147, "num_tokens": 22503649.0, "reward": 1.9226552546024323, "reward_std": 0.6497415080666542, "rewards/accuracy_reward": 0.5256696417927742, "rewards/cosine_scaled_reward": 0.4248872734606266, "rewards/format_reward": 0.9720982164144516, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 523.6540451049805, "epoch": 0.27611940298507465, "grad_norm": 0.2901047468185425, "learning_rate": 9.957578053604837e-07, "loss": 0.155, "num_tokens": 23097323.0, "reward": 1.9160521030426025, "reward_std": 0.5634343735873699, "rewards/accuracy_reward": 0.5234375074505806, "rewards/cosine_scaled_reward": 0.4238645453006029, "rewards/format_reward": 0.96875, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 471.8593978881836, "epoch": 0.2835820895522388, "grad_norm": 0.2969004511833191, "learning_rate": 9.948684711767799e-07, "loss": 0.1299, "num_tokens": 23653853.0, "reward": 1.9379696995019913, "reward_std": 0.4608934037387371, "rewards/accuracy_reward": 0.5189732126891613, "rewards/cosine_scaled_reward": 0.434621412307024, "rewards/format_reward": 0.984375, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 532.287971496582, "epoch": 0.291044776119403, "grad_norm": 0.2593567669391632, "learning_rate": 9.938950460359912e-07, "loss": 0.1593, "num_tokens": 24272495.0, "reward": 1.7254538089036942, "reward_std": 0.5551125332713127, "rewards/accuracy_reward": 0.4196428656578064, "rewards/cosine_scaled_reward": 0.32478404976427555, "rewards/format_reward": 0.9810267761349678, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 484.3884086608887, "epoch": 0.29850746268656714, "grad_norm": 0.279813677072525, "learning_rate": 9.928376953482342e-07, "loss": 0.1591, "num_tokens": 24837707.0, "reward": 1.905771628022194, "reward_std": 0.4304558988660574, "rewards/accuracy_reward": 0.5066964253783226, "rewards/cosine_scaled_reward": 0.42139650508761406, "rewards/format_reward": 0.9776785671710968, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 467.0167579650879, "epoch": 0.30597014925373134, "grad_norm": 0.486409068107605, "learning_rate": 9.916965987847484e-07, "loss": 0.1263, "num_tokens": 25387114.0, "reward": 1.8283725529909134, "reward_std": 0.5463476590812206, "rewards/accuracy_reward": 0.4620535746216774, "rewards/cosine_scaled_reward": 0.38417606614530087, "rewards/format_reward": 0.9821428582072258, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 461.70984268188477, "epoch": 0.31343283582089554, "grad_norm": 0.30666935443878174, "learning_rate": 9.904719502473632e-07, "loss": 0.1408, "num_tokens": 25937686.0, "reward": 1.784839078783989, "reward_std": 0.5817533135414124, "rewards/accuracy_reward": 0.4464285708963871, "rewards/cosine_scaled_reward": 0.3629639744758606, "rewards/format_reward": 0.9754464328289032, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 387.7968940734863, "epoch": 0.3208955223880597, "grad_norm": 0.3015042841434479, "learning_rate": 9.89163957835551e-07, "loss": 0.1362, "num_tokens": 26423936.0, "reward": 1.9611081928014755, "reward_std": 0.5745424814522266, "rewards/accuracy_reward": 0.5156250037252903, "rewards/cosine_scaled_reward": 0.45664383843541145, "rewards/format_reward": 0.9888392761349678, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 348.5122871398926, "epoch": 0.3283582089552239, "grad_norm": 0.34659165143966675, "learning_rate": 9.877728438110645e-07, "loss": 0.1396, "num_tokens": 26857179.0, "reward": 1.9145096093416214, "reward_std": 0.5022773817181587, "rewards/accuracy_reward": 0.4955357201397419, "rewards/cosine_scaled_reward": 0.44241129234433174, "rewards/format_reward": 0.9765625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 332.58372497558594, "epoch": 0.3358208955223881, "grad_norm": 0.43583589792251587, "learning_rate": 9.862988445601687e-07, "loss": 0.169, "num_tokens": 27290358.0, "reward": 1.7086158692836761, "reward_std": 0.429446816444397, "rewards/accuracy_reward": 0.3816964291036129, "rewards/cosine_scaled_reward": 0.3414282575249672, "rewards/format_reward": 0.9854910671710968, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 278.4453239440918, "epoch": 0.34328358208955223, "grad_norm": 0.46366527676582336, "learning_rate": 9.847422105534737e-07, "loss": 0.1147, "num_tokens": 27683117.0, "reward": 1.891074076294899, "reward_std": 0.536506325006485, "rewards/accuracy_reward": 0.4654017798602581, "rewards/cosine_scaled_reward": 0.43236861005425453, "rewards/format_reward": 0.9933035597205162, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 242.423002243042, "epoch": 0.35074626865671643, "grad_norm": 0.5321322679519653, "learning_rate": 9.831032063033724e-07, "loss": 0.113, "num_tokens": 28037664.0, "reward": 1.8316063284873962, "reward_std": 0.5450388044118881, "rewards/accuracy_reward": 0.4308035634458065, "rewards/cosine_scaled_reward": 0.4063830114901066, "rewards/format_reward": 0.9944196417927742, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 193.76898002624512, "epoch": 0.3582089552238806, "grad_norm": 0.5765193104743958, "learning_rate": 9.813821103190931e-07, "loss": 0.1659, "num_tokens": 28342873.0, "reward": 1.669696494936943, "reward_std": 0.4199746139347553, "rewards/accuracy_reward": 0.3482142835855484, "rewards/cosine_scaled_reward": 0.3315267227590084, "rewards/format_reward": 0.9899553507566452, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 158.8895149230957, "epoch": 0.3656716417910448, "grad_norm": 1.0433906316757202, "learning_rate": 9.795792150593738e-07, "loss": 0.135, "num_tokens": 28625046.0, "reward": 1.7185450494289398, "reward_std": 0.4323030523955822, "rewards/accuracy_reward": 0.3727678544819355, "rewards/cosine_scaled_reward": 0.3614020850509405, "rewards/format_reward": 0.9843749925494194, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 140.64509391784668, "epoch": 0.373134328358209, "grad_norm": 2.962268829345703, "learning_rate": 9.776948268827657e-07, "loss": 0.1502, "num_tokens": 28884872.0, "reward": 1.5966612845659256, "reward_std": 0.47611169144511223, "rewards/accuracy_reward": 0.3058035708963871, "rewards/cosine_scaled_reward": 0.29755405336618423, "rewards/format_reward": 0.9933035597205162, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 110.19531726837158, "epoch": 0.3805970149253731, "grad_norm": 1.2034224271774292, "learning_rate": 9.757292659955754e-07, "loss": 0.1468, "num_tokens": 29105703.0, "reward": 1.6452730596065521, "reward_std": 0.4458727203309536, "rewards/accuracy_reward": 0.32924107275903225, "rewards/cosine_scaled_reward": 0.32384440395981073, "rewards/format_reward": 0.9921874925494194, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 88.12277317047119, "epoch": 0.3880597014925373, "grad_norm": 1.8556318283081055, "learning_rate": 9.736828663974526e-07, "loss": 0.1886, "num_tokens": 29322037.0, "reward": 1.5704896599054337, "reward_std": 0.4764312729239464, "rewards/accuracy_reward": 0.29017857275903225, "rewards/cosine_scaled_reward": 0.28700744174420834, "rewards/format_reward": 0.9933035671710968, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 69.49553918838501, "epoch": 0.39552238805970147, "grad_norm": 1.8629873991012573, "learning_rate": 9.715559758246361e-07, "loss": 0.1698, "num_tokens": 29519473.0, "reward": 1.5159788131713867, "reward_std": 0.4448518790304661, "rewards/accuracy_reward": 0.261160715483129, "rewards/cosine_scaled_reward": 0.2581662777811289, "rewards/format_reward": 0.9966517761349678, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 53.76785898208618, "epoch": 0.40298507462686567, "grad_norm": 2.3949267864227295, "learning_rate": 9.69348955690864e-07, "loss": 0.1639, "num_tokens": 29700337.0, "reward": 1.477759376168251, "reward_std": 0.3300577197223902, "rewards/accuracy_reward": 0.2410714291036129, "rewards/cosine_scaled_reward": 0.2400361318141222, "rewards/format_reward": 0.9966517761349678, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 41.593751430511475, "epoch": 0.41044776119402987, "grad_norm": 6.706897258758545, "learning_rate": 9.670621810259594e-07, "loss": 0.107, "num_tokens": 29859917.0, "reward": 1.6255246251821518, "reward_std": 0.3502213731408119, "rewards/accuracy_reward": 0.31473214365541935, "rewards/cosine_scaled_reward": 0.3141406271606684, "rewards/format_reward": 0.9966517761349678, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 36.96540403366089, "epoch": 0.417910447761194, "grad_norm": 3.161987781524658, "learning_rate": 9.64696040412104e-07, "loss": 0.1082, "num_tokens": 30023526.0, "reward": 1.4321366250514984, "reward_std": 0.3272698614746332, "rewards/accuracy_reward": 0.21986606623977423, "rewards/cosine_scaled_reward": 0.21896691620349884, "rewards/format_reward": 0.9933035746216774, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 28.55915331840515, "epoch": 0.4253731343283582, "grad_norm": 2.9937174320220947, "learning_rate": 9.62250935917808e-07, "loss": 0.0479, "num_tokens": 30176771.0, "reward": 1.4093515276908875, "reward_std": 0.22921365313231945, "rewards/accuracy_reward": 0.20535714086145163, "rewards/cosine_scaled_reward": 0.20511038601398468, "rewards/format_reward": 0.9988839253783226, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 26.62834930419922, "epoch": 0.43283582089552236, "grad_norm": 3.9241106510162354, "learning_rate": 9.597272830295876e-07, "loss": 0.035, "num_tokens": 30335774.0, "reward": 1.479707032442093, "reward_std": 0.24030436016619205, "rewards/accuracy_reward": 0.24107142724096775, "rewards/cosine_scaled_reward": 0.24086768366396427, "rewards/format_reward": 0.9977678507566452, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 25.078126192092896, "epoch": 0.44029850746268656, "grad_norm": 3.5471527576446533, "learning_rate": 9.57125510581363e-07, "loss": 0.023, "num_tokens": 30487948.0, "reward": 1.6225911229848862, "reward_std": 0.31288249231874943, "rewards/accuracy_reward": 0.31361607275903225, "rewards/cosine_scaled_reward": 0.31120707653462887, "rewards/format_reward": 0.9977678507566452, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 24.590402603149414, "epoch": 0.44776119402985076, "grad_norm": 2.711862802505493, "learning_rate": 9.5444606068159e-07, "loss": 0.0366, "num_tokens": 30641445.0, "reward": 1.433879777789116, "reward_std": 0.2141956863924861, "rewards/accuracy_reward": 0.21763392444700003, "rewards/cosine_scaled_reward": 0.21624577604234219, "rewards/format_reward": 1.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 23.093750953674316, "epoch": 0.4552238805970149, "grad_norm": 2.6093637943267822, "learning_rate": 9.516893886381321e-07, "loss": 0.0178, "num_tokens": 30803937.0, "reward": 1.3692706674337387, "reward_std": 0.1967663299292326, "rewards/accuracy_reward": 0.1863839286379516, "rewards/cosine_scaled_reward": 0.18623489327728748, "rewards/format_reward": 0.9966517761349678, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 22.19084930419922, "epoch": 0.4626865671641791, "grad_norm": 2.131438732147217, "learning_rate": 9.488559628808938e-07, "loss": 0.0064, "num_tokens": 30953740.0, "reward": 1.422857090830803, "reward_std": 0.20307728182524443, "rewards/accuracy_reward": 0.21205356996506453, "rewards/cosine_scaled_reward": 0.21191950421780348, "rewards/format_reward": 0.9988839253783226, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 22.098215341567993, "epoch": 0.4701492537313433, "grad_norm": 2.0213825702667236, "learning_rate": 9.459462648822207e-07, "loss": 0.0076, "num_tokens": 31112844.0, "reward": 1.4250895529985428, "reward_std": 0.2019376672008093, "rewards/accuracy_reward": 0.2131696455180645, "rewards/cosine_scaled_reward": 0.21303593553602695, "rewards/format_reward": 0.9988839253783226, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 22.00111722946167, "epoch": 0.47761194029850745, "grad_norm": 2.22880220413208, "learning_rate": 9.429607890750862e-07, "loss": 0.0049, "num_tokens": 31274853.0, "reward": 1.5980830639600754, "reward_std": 0.21290546283125877, "rewards/accuracy_reward": 0.29910714365541935, "rewards/cosine_scaled_reward": 0.29897585324943066, "rewards/format_reward": 1.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 21.63169765472412, "epoch": 0.48507462686567165, "grad_norm": 2.368590831756592, "learning_rate": 9.399000427690734e-07, "loss": 0.007, "num_tokens": 31425235.0, "reward": 1.475319281220436, "reward_std": 0.19503124710172415, "rewards/accuracy_reward": 0.2388392835855484, "rewards/cosine_scaled_reward": 0.2375959688797593, "rewards/format_reward": 0.9988839253783226, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 21.521206617355347, "epoch": 0.4925373134328358, "grad_norm": 1.893410325050354, "learning_rate": 9.367645460641714e-07, "loss": 0.0017, "num_tokens": 31578222.0, "reward": 1.4797853082418442, "reward_std": 0.20832678768783808, "rewards/accuracy_reward": 0.23995535541325808, "rewards/cosine_scaled_reward": 0.23982988391071558, "rewards/format_reward": 1.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 21.70647430419922, "epoch": 0.5, "grad_norm": 2.0883381366729736, "learning_rate": 9.335548317623956e-07, "loss": 0.002, "num_tokens": 31726287.0, "reward": 1.573533684015274, "reward_std": 0.2080208584666252, "rewards/accuracy_reward": 0.2868303582072258, "rewards/cosine_scaled_reward": 0.28670324943959713, "rewards/format_reward": 1.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 21.202009916305542, "epoch": 0.5074626865671642, "grad_norm": 2.551394462585449, "learning_rate": 9.302714452772514e-07, "loss": 0.0052, "num_tokens": 31876548.0, "reward": 1.4675123244524002, "reward_std": 0.1835378697142005, "rewards/accuracy_reward": 0.2343750037252903, "rewards/cosine_scaled_reward": 0.23425334133207798, "rewards/format_reward": 0.9988839253783226, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 21.535715103149414, "epoch": 0.5149253731343284, "grad_norm": 2.6030776500701904, "learning_rate": 9.269149445410544e-07, "loss": 0.0046, "num_tokens": 32020812.0, "reward": 1.5433960407972336, "reward_std": 0.20367479603737593, "rewards/accuracy_reward": 0.2723214318975806, "rewards/cosine_scaled_reward": 0.2721905801445246, "rewards/format_reward": 0.9988839253783226, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 20.599331617355347, "epoch": 0.5223880597014925, "grad_norm": 2.546696424484253, "learning_rate": 9.23485899910123e-07, "loss": 0.0003, "num_tokens": 32169237.0, "reward": 1.4831460118293762, "reward_std": 0.17036813125014305, "rewards/accuracy_reward": 0.24029876478016376, "rewards/cosine_scaled_reward": 0.2465388011187315, "rewards/format_reward": 0.9988839253783226, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 20.520090103149414, "epoch": 0.5298507462686567, "grad_norm": 2.1121726036071777, "learning_rate": 9.199848940678605e-07, "loss": 0.0011, "num_tokens": 32314543.0, "reward": 1.5132792741060257, "reward_std": 0.1777313705533743, "rewards/accuracy_reward": 0.25669643096625805, "rewards/cosine_scaled_reward": 0.25658278726041317, "rewards/format_reward": 1.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 21.00334882736206, "epoch": 0.5373134328358209, "grad_norm": 3.4874420166015625, "learning_rate": 9.164125219257417e-07, "loss": 0.0042, "num_tokens": 32464130.0, "reward": 1.4764407873153687, "reward_std": 0.17472796607762575, "rewards/accuracy_reward": 0.23883928544819355, "rewards/cosine_scaled_reward": 0.23871749639511108, "rewards/format_reward": 0.9988839253783226, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 20.36049222946167, "epoch": 0.5447761194029851, "grad_norm": 2.506866931915283, "learning_rate": 9.127693905222223e-07, "loss": -0.0017, "num_tokens": 32614685.0, "reward": 1.508817195892334, "reward_std": 0.1598520427942276, "rewards/accuracy_reward": 0.2555803544819355, "rewards/cosine_scaled_reward": 0.2554689049720764, "rewards/format_reward": 0.9977678507566452, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 20.82366180419922, "epoch": 0.5522388059701493, "grad_norm": 2.803783893585205, "learning_rate": 9.090561189195869e-07, "loss": 0.0032, "num_tokens": 32764399.0, "reward": 1.5980816781520844, "reward_std": 0.1671485211700201, "rewards/accuracy_reward": 0.29910713993012905, "rewards/cosine_scaled_reward": 0.2989744786173105, "rewards/format_reward": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 20.162947416305542, "epoch": 0.5597014925373134, "grad_norm": 2.7349565029144287, "learning_rate": 9.052733380987554e-07, "loss": -0.0034, "num_tokens": 32921881.0, "reward": 1.5400699526071548, "reward_std": 0.15323490625996783, "rewards/accuracy_reward": 0.2700892873108387, "rewards/cosine_scaled_reward": 0.26998060569167137, "rewards/format_reward": 1.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 20.48549175262451, "epoch": 0.5671641791044776, "grad_norm": 2.8505167961120605, "learning_rate": 9.014216908520618e-07, "loss": 0.0031, "num_tokens": 33073868.0, "reward": 1.4675205424427986, "reward_std": 0.18623241062249463, "rewards/accuracy_reward": 0.23437499813735485, "rewards/cosine_scaled_reward": 0.23426154493154172, "rewards/format_reward": 0.9988839253783226, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 20.43861675262451, "epoch": 0.5746268656716418, "grad_norm": 2.716010332107544, "learning_rate": 8.975018316740277e-07, "loss": 0.0014, "num_tokens": 33228253.0, "reward": 1.3793513923883438, "reward_std": 0.13768241831448336, "rewards/accuracy_reward": 0.18973214644938707, "rewards/cosine_scaled_reward": 0.1896191742271185, "rewards/format_reward": 1.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 20.793527603149414, "epoch": 0.582089552238806, "grad_norm": 2.40360164642334, "learning_rate": 8.935144266501468e-07, "loss": 0.0032, "num_tokens": 33381900.0, "reward": 1.5244351625442505, "reward_std": 0.11904470889763274, "rewards/accuracy_reward": 0.26227678917348385, "rewards/cosine_scaled_reward": 0.2621582942083478, "rewards/format_reward": 1.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 20.420759677886963, "epoch": 0.5895522388059702, "grad_norm": 1.7719311714172363, "learning_rate": 8.894601533436998e-07, "loss": 0.0, "num_tokens": 33529069.0, "reward": 1.4775669872760773, "reward_std": 0.08695766101230618, "rewards/accuracy_reward": 0.238839291036129, "rewards/cosine_scaled_reward": 0.23872762825340033, "rewards/format_reward": 1.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 21.217634916305542, "epoch": 0.5970149253731343, "grad_norm": 2.443143129348755, "learning_rate": 8.853397006806181e-07, "loss": 0.0005, "num_tokens": 33688720.0, "reward": 1.513253703713417, "reward_std": 0.07582757750845559, "rewards/accuracy_reward": 0.25669642724096775, "rewards/cosine_scaled_reward": 0.25655714236199856, "rewards/format_reward": 1.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 20.36495614051819, "epoch": 0.6044776119402985, "grad_norm": 2.328760862350464, "learning_rate": 8.811537688324187e-07, "loss": 0.0005, "num_tokens": 33841447.0, "reward": 1.5244421511888504, "reward_std": 0.14594503585249186, "rewards/accuracy_reward": 0.2622767873108387, "rewards/cosine_scaled_reward": 0.2621652837842703, "rewards/format_reward": 1.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 20.29017925262451, "epoch": 0.6119402985074627, "grad_norm": 2.725046396255493, "learning_rate": 8.769030690972261e-07, "loss": 0.001, "num_tokens": 33983067.0, "reward": 1.6338183134794235, "reward_std": 0.10190565621059022, "rewards/accuracy_reward": 0.31696428544819355, "rewards/cosine_scaled_reward": 0.31685397773981094, "rewards/format_reward": 1.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 20.447545528411865, "epoch": 0.6194029850746269, "grad_norm": 2.60213565826416, "learning_rate": 8.725883237789044e-07, "loss": 0.0015, "num_tokens": 34142804.0, "reward": 1.578012928366661, "reward_std": 0.13737911762410704, "rewards/accuracy_reward": 0.28906249813735485, "rewards/cosine_scaled_reward": 0.2889503873884678, "rewards/format_reward": 1.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 20.629465103149414, "epoch": 0.6268656716417911, "grad_norm": 3.175710439682007, "learning_rate": 8.682102660643195e-07, "loss": 0.0005, "num_tokens": 34285752.0, "reward": 1.6293497383594513, "reward_std": 0.1396320709803831, "rewards/accuracy_reward": 0.3147321417927742, "rewards/cosine_scaled_reward": 0.31461753230541945, "rewards/format_reward": 1.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 20.609375715255737, "epoch": 0.6343283582089553, "grad_norm": 1.994175672531128, "learning_rate": 8.637696398987515e-07, "loss": 0.0006, "num_tokens": 34437546.0, "reward": 1.4998853504657745, "reward_std": 0.11160349007695913, "rewards/accuracy_reward": 0.25, "rewards/cosine_scaled_reward": 0.2498853299766779, "rewards/format_reward": 1.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 20.772322177886963, "epoch": 0.6417910447761194, "grad_norm": 2.2707390785217285, "learning_rate": 8.592671998594793e-07, "loss": 0.0014, "num_tokens": 34590166.0, "reward": 1.569079726934433, "reward_std": 0.12746261023711725, "rewards/accuracy_reward": 0.2845982136204839, "rewards/cosine_scaled_reward": 0.2844814406707883, "rewards/format_reward": 1.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 20.42522406578064, "epoch": 0.6492537313432836, "grad_norm": 3.1332757472991943, "learning_rate": 8.547037110275579e-07, "loss": 0.0003, "num_tokens": 34732587.0, "reward": 1.5824772864580154, "reward_std": 0.17397390864789486, "rewards/accuracy_reward": 0.29129463993012905, "rewards/cosine_scaled_reward": 0.2911826092749834, "rewards/format_reward": 1.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 20.521206378936768, "epoch": 0.6567164179104478, "grad_norm": 3.331784248352051, "learning_rate": 8.500799488578119e-07, "loss": 0.0018, "num_tokens": 34886118.0, "reward": 1.517743095755577, "reward_std": 0.10318425773594697, "rewards/accuracy_reward": 0.25892857275903225, "rewards/cosine_scaled_reward": 0.2588145062327385, "rewards/format_reward": 1.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 20.195313453674316, "epoch": 0.664179104477612, "grad_norm": 2.983870506286621, "learning_rate": 8.453966990470656e-07, "loss": 0.0017, "num_tokens": 35035173.0, "reward": 1.4329265505075455, "reward_std": 0.1654082857307344, "rewards/accuracy_reward": 0.21651785587891936, "rewards/cosine_scaled_reward": 0.21640866296365857, "rewards/format_reward": 1.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 20.505581378936768, "epoch": 0.6716417910447762, "grad_norm": 2.7624764442443848, "learning_rate": 8.406547574006324e-07, "loss": 0.0028, "num_tokens": 35170154.0, "reward": 1.6784569025039673, "reward_std": 0.11971815738125713, "rewards/accuracy_reward": 0.33928571827709675, "rewards/cosine_scaled_reward": 0.339171065017581, "rewards/format_reward": 1.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 20.358259677886963, "epoch": 0.6791044776119403, "grad_norm": 3.5564069747924805, "learning_rate": 8.358549296970875e-07, "loss": 0.0004, "num_tokens": 35322683.0, "reward": 1.4887285381555557, "reward_std": 0.10889045795626373, "rewards/accuracy_reward": 0.24441964458674192, "rewards/cosine_scaled_reward": 0.24430878367275, "rewards/format_reward": 1.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 20.440849542617798, "epoch": 0.6865671641791045, "grad_norm": 3.2373881340026855, "learning_rate": 8.309980315513442e-07, "loss": 0.0002, "num_tokens": 35471790.0, "reward": 1.7141735553741455, "reward_std": 0.15256303502246737, "rewards/accuracy_reward": 0.3571428544819355, "rewards/cosine_scaled_reward": 0.3570306524634361, "rewards/format_reward": 1.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 20.72433114051819, "epoch": 0.6940298507462687, "grad_norm": 3.330997943878174, "learning_rate": 8.260848882760615e-07, "loss": -0.0002, "num_tokens": 35625503.0, "reward": 1.480910375714302, "reward_std": 0.10137590842316513, "rewards/accuracy_reward": 0.2410714291036129, "rewards/cosine_scaled_reward": 0.24095493368804455, "rewards/format_reward": 0.9988839253783226, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 20.22991156578064, "epoch": 0.7014925373134329, "grad_norm": 2.4767699241638184, "learning_rate": 8.211163347414003e-07, "loss": -0.0, "num_tokens": 35774893.0, "reward": 1.667300522327423, "reward_std": 0.13715945463627577, "rewards/accuracy_reward": 0.3337053582072258, "rewards/cosine_scaled_reward": 0.3335950942710042, "rewards/format_reward": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 20.570313692092896, "epoch": 0.7089552238805971, "grad_norm": 2.710425615310669, "learning_rate": 8.160932152331586e-07, "loss": 0.001, "num_tokens": 35919596.0, "reward": 1.5222073197364807, "reward_std": 0.11986963993861366, "rewards/accuracy_reward": 0.2611607192084193, "rewards/cosine_scaled_reward": 0.2610465129837394, "rewards/format_reward": 1.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 20.17745590209961, "epoch": 0.7164179104477612, "grad_norm": 2.4527230262756348, "learning_rate": 8.110163833093049e-07, "loss": -0.0014, "num_tokens": 36073515.0, "reward": 1.4708732217550278, "reward_std": 0.1094957971945405, "rewards/accuracy_reward": 0.23549107182770967, "rewards/cosine_scaled_reward": 0.2353821201249957, "rewards/format_reward": 1.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 20.28459882736206, "epoch": 0.7238805970149254, "grad_norm": 3.3163673877716064, "learning_rate": 8.058867016549371e-07, "loss": -0.0004, "num_tokens": 36224698.0, "reward": 1.5244434028863907, "reward_std": 0.12670162599533796, "rewards/accuracy_reward": 0.26227678544819355, "rewards/cosine_scaled_reward": 0.2621665708720684, "rewards/format_reward": 1.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 20.32366132736206, "epoch": 0.7313432835820896, "grad_norm": 2.859325408935547, "learning_rate": 8.007050419356898e-07, "loss": -0.0038, "num_tokens": 36379460.0, "reward": 1.5936386585235596, "reward_std": 0.11910764441277877, "rewards/accuracy_reward": 0.2968750074505806, "rewards/cosine_scaled_reward": 0.2967636212706566, "rewards/format_reward": 1.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 20.453126192092896, "epoch": 0.7388059701492538, "grad_norm": 2.248077154159546, "learning_rate": 7.954722846496149e-07, "loss": 0.003, "num_tokens": 36535562.0, "reward": 1.5244402140378952, "reward_std": 0.08259574370241296, "rewards/accuracy_reward": 0.2622767873108387, "rewards/cosine_scaled_reward": 0.2621633689850569, "rewards/format_reward": 1.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 20.748884916305542, "epoch": 0.746268656716418, "grad_norm": 2.1548383235931396, "learning_rate": 7.901893189775639e-07, "loss": 0.0007, "num_tokens": 36690505.0, "reward": 1.4797910004854202, "reward_std": 0.08582926816011138, "rewards/accuracy_reward": 0.23995536100119352, "rewards/cosine_scaled_reward": 0.23983560875058174, "rewards/format_reward": 1.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 20.24776864051819, "epoch": 0.753731343283582, "grad_norm": 2.7886292934417725, "learning_rate": 7.848570426320916e-07, "loss": 0.0015, "num_tokens": 36832751.0, "reward": 1.6460942327976227, "reward_std": 0.10873846150510502, "rewards/accuracy_reward": 0.32477678917348385, "rewards/cosine_scaled_reward": 0.3246655622497201, "rewards/format_reward": 0.9966517835855484, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 20.510045528411865, "epoch": 0.7611940298507462, "grad_norm": 2.756274700164795, "learning_rate": 7.794763617049123e-07, "loss": 0.0005, "num_tokens": 36983200.0, "reward": 1.7454221993684769, "reward_std": 0.10062572493555422, "rewards/accuracy_reward": 0.3727678582072258, "rewards/cosine_scaled_reward": 0.372654240578413, "rewards/format_reward": 1.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 20.693081378936768, "epoch": 0.7686567164179104, "grad_norm": 1.6583493947982788, "learning_rate": 7.740481905129306e-07, "loss": 0.0009, "num_tokens": 37135357.0, "reward": 1.5065791308879852, "reward_std": 0.045692659896090504, "rewards/accuracy_reward": 0.2533482136204839, "rewards/cosine_scaled_reward": 0.2532308688387275, "rewards/format_reward": 1.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 20.45424199104309, "epoch": 0.7761194029850746, "grad_norm": 3.4718310832977295, "learning_rate": 7.685734514428766e-07, "loss": 0.0004, "num_tokens": 37286476.0, "reward": 1.4396189004182816, "reward_std": 0.07011978597014945, "rewards/accuracy_reward": 0.21986607369035482, "rewards/cosine_scaled_reward": 0.2197527764365077, "rewards/format_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 20.142858266830444, "epoch": 0.7835820895522388, "grad_norm": 3.465782642364502, "learning_rate": 7.630530747945672e-07, "loss": -0.0001, "num_tokens": 37435684.0, "reward": 1.4943107217550278, "reward_std": 0.08559985571561413, "rewards/accuracy_reward": 0.2477678619325161, "rewards/cosine_scaled_reward": 0.2476588897407055, "rewards/format_reward": 0.9988839253783226, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 20.049107789993286, "epoch": 0.7910447761194029, "grad_norm": 2.42425274848938, "learning_rate": 7.574879986228244e-07, "loss": -0.0001, "num_tokens": 37582544.0, "reward": 1.5132858008146286, "reward_std": 0.08311572534432088, "rewards/accuracy_reward": 0.2566964318975806, "rewards/cosine_scaled_reward": 0.25658932141959667, "rewards/format_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 20.716518878936768, "epoch": 0.7985074626865671, "grad_norm": 2.004718065261841, "learning_rate": 7.518791685780768e-07, "loss": 0.0029, "num_tokens": 37739602.0, "reward": 1.6248817294836044, "reward_std": 0.07741166379830844, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.3123816456645727, "rewards/format_reward": 1.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 20.168527364730835, "epoch": 0.8059701492537313, "grad_norm": 3.211866855621338, "learning_rate": 7.462275377456669e-07, "loss": 0.0001, "num_tokens": 37891401.0, "reward": 1.5891772359609604, "reward_std": 0.09964832732346451, "rewards/accuracy_reward": 0.2946428610011935, "rewards/cosine_scaled_reward": 0.2945342995226383, "rewards/format_reward": 1.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 20.299107789993286, "epoch": 0.8134328358208955, "grad_norm": 2.558931589126587, "learning_rate": 7.405340664838993e-07, "loss": 0.0004, "num_tokens": 38042885.0, "reward": 1.4619427621364594, "reward_std": 0.11446394885876998, "rewards/accuracy_reward": 0.23102678544819355, "rewards/cosine_scaled_reward": 0.23091593850404024, "rewards/format_reward": 1.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 20.55022406578064, "epoch": 0.8208955223880597, "grad_norm": 2.110745906829834, "learning_rate": 7.347997222608492e-07, "loss": -0.0002, "num_tokens": 38197242.0, "reward": 1.5735462754964828, "reward_std": 0.0649347297767946, "rewards/accuracy_reward": 0.2868303544819355, "rewards/cosine_scaled_reward": 0.2867158204317093, "rewards/format_reward": 1.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 20.551340103149414, "epoch": 0.8283582089552238, "grad_norm": 1.8552799224853516, "learning_rate": 7.290254794899664e-07, "loss": 0.002, "num_tokens": 38340640.0, "reward": 1.5255557298660278, "reward_std": 0.06177972303260404, "rewards/accuracy_reward": 0.26339286006987095, "rewards/cosine_scaled_reward": 0.26327887177467346, "rewards/format_reward": 0.9988839253783226, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 20.752233266830444, "epoch": 0.835820895522388, "grad_norm": 2.4142839908599854, "learning_rate": 7.232123193644956e-07, "loss": 0.0006, "num_tokens": 38500346.0, "reward": 1.51997210085392, "reward_std": 0.07079227790242726, "rewards/accuracy_reward": 0.2600446445867419, "rewards/cosine_scaled_reward": 0.25992743112146854, "rewards/format_reward": 1.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 20.882813215255737, "epoch": 0.8432835820895522, "grad_norm": 2.0007951259613037, "learning_rate": 7.173612296907472e-07, "loss": 0.0001, "num_tokens": 38658729.0, "reward": 1.6516644805669785, "reward_std": 0.10580981522798538, "rewards/accuracy_reward": 0.3258928544819355, "rewards/cosine_scaled_reward": 0.3257715832442045, "rewards/format_reward": 1.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 20.75558114051819, "epoch": 0.8507462686567164, "grad_norm": 2.6902952194213867, "learning_rate": 7.114732047202432e-07, "loss": -0.0002, "num_tokens": 38805462.0, "reward": 1.6114880591630936, "reward_std": 0.09198851990785784, "rewards/accuracy_reward": 0.3058035708963871, "rewards/cosine_scaled_reward": 0.3056844547390938, "rewards/format_reward": 1.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 20.13616156578064, "epoch": 0.8582089552238806, "grad_norm": 1.9690958261489868, "learning_rate": 7.055492449807683e-07, "loss": 0.0005, "num_tokens": 38959272.0, "reward": 1.5489989072084427, "reward_std": 0.08409377404399265, "rewards/accuracy_reward": 0.27455357648432255, "rewards/cosine_scaled_reward": 0.27444526366889477, "rewards/format_reward": 1.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 20.392857789993286, "epoch": 0.8656716417910447, "grad_norm": 1.6137839555740356, "learning_rate": 6.99590357106354e-07, "loss": 0.0002, "num_tokens": 39103456.0, "reward": 1.5356025993824005, "reward_std": 0.06981676115469782, "rewards/accuracy_reward": 0.2678571445867419, "rewards/cosine_scaled_reward": 0.2677453998476267, "rewards/format_reward": 1.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 20.395090103149414, "epoch": 0.8731343283582089, "grad_norm": 3.123340129852295, "learning_rate": 6.935975536662253e-07, "loss": 0.0011, "num_tokens": 39248058.0, "reward": 1.589174211025238, "reward_std": 0.12407711929557763, "rewards/accuracy_reward": 0.29464285634458065, "rewards/cosine_scaled_reward": 0.29453128995373845, "rewards/format_reward": 1.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 20.720983028411865, "epoch": 0.8805970149253731, "grad_norm": 2.0955867767333984, "learning_rate": 6.875718529927404e-07, "loss": 0.0006, "num_tokens": 39410136.0, "reward": 1.5802399963140488, "reward_std": 0.0863498275235628, "rewards/accuracy_reward": 0.2901785708963871, "rewards/cosine_scaled_reward": 0.2900614067912102, "rewards/format_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 20.71651864051819, "epoch": 0.8880597014925373, "grad_norm": 3.44924259185791, "learning_rate": 6.815142790083473e-07, "loss": 0.0025, "num_tokens": 39569234.0, "reward": 1.7007768154144287, "reward_std": 0.14646728224154515, "rewards/accuracy_reward": 0.3504464291036129, "rewards/cosine_scaled_reward": 0.35033031180500984, "rewards/format_reward": 1.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 20.483259916305542, "epoch": 0.8955223880597015, "grad_norm": 2.0056583881378174, "learning_rate": 6.754258610515948e-07, "loss": 0.0015, "num_tokens": 39727235.0, "reward": 1.4820292592048645, "reward_std": 0.07951601898218996, "rewards/accuracy_reward": 0.24107143096625805, "rewards/cosine_scaled_reward": 0.24095771089196205, "rewards/format_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 20.08147430419922, "epoch": 0.9029850746268657, "grad_norm": 2.79129958152771, "learning_rate": 6.69307633702221e-07, "loss": 0.0001, "num_tokens": 39874236.0, "reward": 1.5199817568063736, "reward_std": 0.08146786268110873, "rewards/accuracy_reward": 0.2600446455180645, "rewards/cosine_scaled_reward": 0.2599370051175356, "rewards/format_reward": 1.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 20.085938453674316, "epoch": 0.9104477611940298, "grad_norm": 3.3468284606933594, "learning_rate": 6.631606366053506e-07, "loss": -0.0003, "num_tokens": 40015729.0, "reward": 1.7141781598329544, "reward_std": 0.10137949584012773, "rewards/accuracy_reward": 0.3571428582072258, "rewards/cosine_scaled_reward": 0.35703518986701965, "rewards/format_reward": 1.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 20.622768878936768, "epoch": 0.917910447761194, "grad_norm": 2.6946330070495605, "learning_rate": 6.569859142948327e-07, "loss": 0.0021, "num_tokens": 40163039.0, "reward": 1.5467600226402283, "reward_std": 0.11038771457970142, "rewards/accuracy_reward": 0.27343749813735485, "rewards/cosine_scaled_reward": 0.2733224518597126, "rewards/format_reward": 1.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 20.418527841567993, "epoch": 0.9253731343283582, "grad_norm": 2.7771787643432617, "learning_rate": 6.507845160157475e-07, "loss": 0.0004, "num_tokens": 40317998.0, "reward": 1.6159598380327225, "reward_std": 0.07643294239515797, "rewards/accuracy_reward": 0.30803571827709675, "rewards/cosine_scaled_reward": 0.30792406760156155, "rewards/format_reward": 1.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 20.400670289993286, "epoch": 0.9328358208955224, "grad_norm": 1.899020791053772, "learning_rate": 6.445574955461133e-07, "loss": -0.0025, "num_tokens": 40465605.0, "reward": 1.4998881071805954, "reward_std": 0.10077201342210174, "rewards/accuracy_reward": 0.2500000037252903, "rewards/cosine_scaled_reward": 0.2498880298808217, "rewards/format_reward": 1.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 20.99330425262451, "epoch": 0.9402985074626866, "grad_norm": 1.1476165056228638, "learning_rate": 6.383059110178203e-07, "loss": 0.0009, "num_tokens": 40617031.0, "reward": 1.6114817261695862, "reward_std": 0.0364510658172037, "rewards/accuracy_reward": 0.30580357275903225, "rewards/cosine_scaled_reward": 0.3056781152263284, "rewards/format_reward": 1.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 20.44642949104309, "epoch": 0.9477611940298507, "grad_norm": 2.6259467601776123, "learning_rate": 6.320308247368284e-07, "loss": 0.0001, "num_tokens": 40772791.0, "reward": 1.5289026349782944, "reward_std": 0.13789613312110305, "rewards/accuracy_reward": 0.2645089328289032, "rewards/cosine_scaled_reward": 0.2643936015665531, "rewards/format_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 20.164063453674316, "epoch": 0.9552238805970149, "grad_norm": 1.8769416809082031, "learning_rate": 6.257333030026538e-07, "loss": -0.0007, "num_tokens": 40921778.0, "reward": 1.5445343106985092, "reward_std": 0.07823521745721607, "rewards/accuracy_reward": 0.2723214318975806, "rewards/cosine_scaled_reward": 0.2722127726301551, "rewards/format_reward": 1.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 20.060268878936768, "epoch": 0.9626865671641791, "grad_norm": 3.188025951385498, "learning_rate": 6.194144159271755e-07, "loss": 0.0006, "num_tokens": 41068368.0, "reward": 1.4440890699625015, "reward_std": 0.07432721156590105, "rewards/accuracy_reward": 0.22209821362048388, "rewards/cosine_scaled_reward": 0.22199082095175982, "rewards/format_reward": 1.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 20.361608028411865, "epoch": 0.9701492537313433, "grad_norm": 1.6484222412109375, "learning_rate": 6.130752372527981e-07, "loss": 0.0019, "num_tokens": 41215084.0, "reward": 1.4931926876306534, "reward_std": 0.051030852994102816, "rewards/accuracy_reward": 0.246651791036129, "rewards/cosine_scaled_reward": 0.24654078483581543, "rewards/format_reward": 1.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 20.842634677886963, "epoch": 0.9776119402985075, "grad_norm": 3.736074447631836, "learning_rate": 6.067168441699927e-07, "loss": 0.0003, "num_tokens": 41368055.0, "reward": 1.6940742880105972, "reward_std": 0.12490348052233458, "rewards/accuracy_reward": 0.3470982164144516, "rewards/cosine_scaled_reward": 0.3469760064035654, "rewards/format_reward": 1.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 20.293527603149414, "epoch": 0.9850746268656716, "grad_norm": 11.65377140045166, "learning_rate": 6.003403171342562e-07, "loss": -0.0001, "num_tokens": 41521014.0, "reward": 1.388282224535942, "reward_std": 0.06252689357782515, "rewards/accuracy_reward": 0.19419642724096775, "rewards/cosine_scaled_reward": 0.19408572791144252, "rewards/format_reward": 1.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 20.136138439178467, "epoch": 0.9925373134328358, "grad_norm": 3.632760763168335, "learning_rate": 5.939467396825136e-07, "loss": 0.0005, "num_tokens": 41671398.0, "reward": 1.4574763923883438, "reward_std": 0.08762641241588653, "rewards/accuracy_reward": 0.2287946417927742, "rewards/cosine_scaled_reward": 0.22868163883686066, "rewards/format_reward": 1.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 20.29799175262451, "epoch": 1.007462686567164, "grad_norm": 5.531825542449951, "learning_rate": 5.875371982489958e-07, "loss": 0.0008, "num_tokens": 41808033.0, "reward": 1.5132822692394257, "reward_std": 0.07973260305406171, "rewards/accuracy_reward": 0.2566964328289032, "rewards/cosine_scaled_reward": 0.2565857656300068, "rewards/format_reward": 1.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 21.155134916305542, "epoch": 1.0149253731343284, "grad_norm": 4.794013023376465, "learning_rate": 5.811127819806276e-07, "loss": -0.0001, "num_tokens": 41952708.0, "reward": 1.6561159640550613, "reward_std": 0.10551196637798199, "rewards/accuracy_reward": 0.32812500186264515, "rewards/cosine_scaled_reward": 0.32799087278544903, "rewards/format_reward": 1.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 20.325893878936768, "epoch": 1.0223880597014925, "grad_norm": 4.288095474243164, "learning_rate": 5.746745825519538e-07, "loss": -0.0005, "num_tokens": 42096168.0, "reward": 1.6539065688848495, "reward_std": 0.06463327458860135, "rewards/accuracy_reward": 0.32700893096625805, "rewards/cosine_scaled_reward": 0.3268975578248501, "rewards/format_reward": 1.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 20.46428680419922, "epoch": 1.0298507462686568, "grad_norm": 2.075472593307495, "learning_rate": 5.682236939796336e-07, "loss": 0.0006, "num_tokens": 42245792.0, "reward": 1.682920902967453, "reward_std": 0.11724273651551442, "rewards/accuracy_reward": 0.34151786006987095, "rewards/cosine_scaled_reward": 0.34140297770500183, "rewards/format_reward": 1.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 20.25334930419922, "epoch": 1.037313432835821, "grad_norm": 3.787569046020508, "learning_rate": 5.61761212436541e-07, "loss": 0.0007, "num_tokens": 42387403.0, "reward": 1.5601582527160645, "reward_std": 0.09724155126728817, "rewards/accuracy_reward": 0.2801339253783226, "rewards/cosine_scaled_reward": 0.28002420626580715, "rewards/format_reward": 1.0, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 20.424108266830444, "epoch": 1.044776119402985, "grad_norm": 2.1570165157318115, "learning_rate": 5.552882360654949e-07, "loss": 0.0008, "num_tokens": 42536287.0, "reward": 1.627119928598404, "reward_std": 0.10236127915219129, "rewards/accuracy_reward": 0.313616075553, "rewards/cosine_scaled_reward": 0.3135037589818239, "rewards/format_reward": 1.0, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 20.625000953674316, "epoch": 1.0522388059701493, "grad_norm": 3.6223268508911133, "learning_rate": 5.488058647926577e-07, "loss": 0.001, "num_tokens": 42696151.0, "reward": 1.535599023103714, "reward_std": 0.09071048016893712, "rewards/accuracy_reward": 0.2654533013701439, "rewards/cosine_scaled_reward": 0.27220611833035946, "rewards/format_reward": 1.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 20.46428632736206, "epoch": 1.0597014925373134, "grad_norm": 3.7596476078033447, "learning_rate": 5.423152001406282e-07, "loss": -0.0003, "num_tokens": 42855279.0, "reward": 1.6070294231176376, "reward_std": 0.12858991045504808, "rewards/accuracy_reward": 0.30357142724096775, "rewards/cosine_scaled_reward": 0.3034578887745738, "rewards/format_reward": 1.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 20.022322177886963, "epoch": 1.0671641791044777, "grad_norm": 3.700883150100708, "learning_rate": 5.358173450412648e-07, "loss": -0.0001, "num_tokens": 43008275.0, "reward": 1.5623932778835297, "reward_std": 0.1167864422913425, "rewards/accuracy_reward": 0.2812500009313226, "rewards/cosine_scaled_reward": 0.28114316891878843, "rewards/format_reward": 1.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 20.39955425262451, "epoch": 1.0746268656716418, "grad_norm": 4.979684829711914, "learning_rate": 5.293134036482698e-07, "loss": 0.0, "num_tokens": 43159225.0, "reward": 1.5132808834314346, "reward_std": 0.09190170587856983, "rewards/accuracy_reward": 0.2566964328289032, "rewards/cosine_scaled_reward": 0.25658440589904785, "rewards/format_reward": 1.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 20.123884677886963, "epoch": 1.0820895522388059, "grad_norm": 2.860994338989258, "learning_rate": 5.228044811495631e-07, "loss": -0.0003, "num_tokens": 43309240.0, "reward": 1.582481175661087, "reward_std": 0.0821403276665933, "rewards/accuracy_reward": 0.2912946417927742, "rewards/cosine_scaled_reward": 0.29118647053837776, "rewards/format_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 20.095982789993286, "epoch": 1.0895522388059702, "grad_norm": 3.0046682357788086, "learning_rate": 5.162916835794843e-07, "loss": 0.0002, "num_tokens": 43458758.0, "reward": 1.5556955337524414, "reward_std": 0.07936285677858734, "rewards/accuracy_reward": 0.2779017873108387, "rewards/cosine_scaled_reward": 0.27779373340308666, "rewards/format_reward": 1.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 20.234375476837158, "epoch": 1.0970149253731343, "grad_norm": 4.488862991333008, "learning_rate": 5.09776117630847e-07, "loss": 0.0004, "num_tokens": 43608960.0, "reward": 1.6360509097576141, "reward_std": 0.07161617746324822, "rewards/accuracy_reward": 0.3180803544819355, "rewards/cosine_scaled_reward": 0.31797049194574356, "rewards/format_reward": 1.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 20.46651864051819, "epoch": 1.1044776119402986, "grad_norm": 1.4745782613754272, "learning_rate": 5.032588904668851e-07, "loss": -0.0036, "num_tokens": 43754050.0, "reward": 1.7454189360141754, "reward_std": 0.05425459118813336, "rewards/accuracy_reward": 0.37276786379516125, "rewards/cosine_scaled_reward": 0.37265095487236977, "rewards/format_reward": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 20.15736722946167, "epoch": 1.1119402985074627, "grad_norm": 4.071691513061523, "learning_rate": 4.967411095331149e-07, "loss": 0.0004, "num_tokens": 43899775.0, "reward": 1.6427484452724457, "reward_std": 0.09672015977940873, "rewards/accuracy_reward": 0.32142856903374195, "rewards/cosine_scaled_reward": 0.32131983526051044, "rewards/format_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 20.150670528411865, "epoch": 1.1194029850746268, "grad_norm": 5.170862197875977, "learning_rate": 4.90223882369153e-07, "loss": 0.001, "num_tokens": 44050078.0, "reward": 1.629356011748314, "reward_std": 0.12038855330866483, "rewards/accuracy_reward": 0.3147321455180645, "rewards/cosine_scaled_reward": 0.3146238140761852, "rewards/format_reward": 1.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 20.264509439468384, "epoch": 1.126865671641791, "grad_norm": 2.8727357387542725, "learning_rate": 4.837083164205159e-07, "loss": 0.0013, "num_tokens": 44198707.0, "reward": 1.5356042981147766, "reward_std": 0.13752735047977538, "rewards/accuracy_reward": 0.2678571455180645, "rewards/cosine_scaled_reward": 0.26774709299206734, "rewards/format_reward": 1.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 20.49330425262451, "epoch": 1.1343283582089552, "grad_norm": 2.9410152435302734, "learning_rate": 4.77195518850437e-07, "loss": -0.0017, "num_tokens": 44350965.0, "reward": 1.5467612594366074, "reward_std": 0.08439550402343343, "rewards/accuracy_reward": 0.27343750186264515, "rewards/cosine_scaled_reward": 0.2733236690983176, "rewards/format_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 20.28459882736206, "epoch": 1.1417910447761195, "grad_norm": 5.266554832458496, "learning_rate": 4.7068659635173025e-07, "loss": -0.0006, "num_tokens": 44496740.0, "reward": 1.571318507194519, "reward_std": 0.09055654217311826, "rewards/accuracy_reward": 0.2857142873108387, "rewards/cosine_scaled_reward": 0.2856041230261326, "rewards/format_reward": 1.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 20.398438453674316, "epoch": 1.1492537313432836, "grad_norm": 2.222485303878784, "learning_rate": 4.6418265495873516e-07, "loss": -0.0004, "num_tokens": 44656433.0, "reward": 1.667298749089241, "reward_std": 0.10092825663519989, "rewards/accuracy_reward": 0.3337053582072258, "rewards/cosine_scaled_reward": 0.3335932996124029, "rewards/format_reward": 1.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 20.23995590209961, "epoch": 1.1567164179104479, "grad_norm": 2.372213840484619, "learning_rate": 4.5768479985937194e-07, "loss": 0.002, "num_tokens": 44807600.0, "reward": 1.624890297651291, "reward_std": 0.13548944082560865, "rewards/accuracy_reward": 0.31249999813735485, "rewards/cosine_scaled_reward": 0.31239020079374313, "rewards/format_reward": 1.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 20.25558114051819, "epoch": 1.164179104477612, "grad_norm": 1.6955794095993042, "learning_rate": 4.511941352073424e-07, "loss": 0.0005, "num_tokens": 44957893.0, "reward": 1.5780149698257446, "reward_std": 0.07597658339989977, "rewards/accuracy_reward": 0.28906250558793545, "rewards/cosine_scaled_reward": 0.2889523971825838, "rewards/format_reward": 1.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 20.380581378936768, "epoch": 1.171641791044776, "grad_norm": 2.024655342102051, "learning_rate": 4.4471176393450515e-07, "loss": 0.001, "num_tokens": 45113066.0, "reward": 1.4954225569963455, "reward_std": 0.09183560762491538, "rewards/accuracy_reward": 0.2477678582072258, "rewards/cosine_scaled_reward": 0.24765462800860405, "rewards/format_reward": 1.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 19.982143878936768, "epoch": 1.1791044776119404, "grad_norm": 3.3944594860076904, "learning_rate": 4.382387875634591e-07, "loss": 0.0004, "num_tokens": 45261530.0, "reward": 1.6472150832414627, "reward_std": 0.10724194766953588, "rewards/accuracy_reward": 0.32366071455180645, "rewards/cosine_scaled_reward": 0.32355429045856, "rewards/format_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 20.106027603149414, "epoch": 1.1865671641791045, "grad_norm": 1.5790033340454102, "learning_rate": 4.317763060203664e-07, "loss": 0.0002, "num_tokens": 45416145.0, "reward": 1.4440883994102478, "reward_std": 0.06350326852842159, "rewards/accuracy_reward": 0.22209821734577417, "rewards/cosine_scaled_reward": 0.2219901392236352, "rewards/format_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 20.33705449104309, "epoch": 1.1940298507462686, "grad_norm": 1.6651524305343628, "learning_rate": 4.253254174480462e-07, "loss": -0.0, "num_tokens": 45560303.0, "reward": 1.5333705618977547, "reward_std": 0.040959979950784486, "rewards/accuracy_reward": 0.266741075553, "rewards/cosine_scaled_reward": 0.2666294459568235, "rewards/format_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 20.408483028411865, "epoch": 1.2014925373134329, "grad_norm": 3.0133707523345947, "learning_rate": 4.1888721801937226e-07, "loss": 0.001, "num_tokens": 45701981.0, "reward": 1.5891735255718231, "reward_std": 0.08890740286335586, "rewards/accuracy_reward": 0.2946428610011935, "rewards/cosine_scaled_reward": 0.2945305937901139, "rewards/format_reward": 1.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 20.158483266830444, "epoch": 1.208955223880597, "grad_norm": 2.2523109912872314, "learning_rate": 4.124628017510042e-07, "loss": -0.0028, "num_tokens": 45851651.0, "reward": 1.5891766995191574, "reward_std": 0.08664929727092385, "rewards/accuracy_reward": 0.2946428647264838, "rewards/cosine_scaled_reward": 0.2945337858982384, "rewards/format_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 20.29017925262451, "epoch": 1.2164179104477613, "grad_norm": 1.8585433959960938, "learning_rate": 4.0605326031748646e-07, "loss": -0.0001, "num_tokens": 45994527.0, "reward": 1.5445328205823898, "reward_std": 0.05591056302149866, "rewards/accuracy_reward": 0.27232143096625805, "rewards/cosine_scaled_reward": 0.27221135422587395, "rewards/format_reward": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 20.445313453674316, "epoch": 1.2238805970149254, "grad_norm": 5.333817481994629, "learning_rate": 3.9965968286574367e-07, "loss": 0.0007, "num_tokens": 46140630.0, "reward": 1.4463159441947937, "reward_std": 0.08372260938289244, "rewards/accuracy_reward": 0.22321428637951612, "rewards/cosine_scaled_reward": 0.2231016056612134, "rewards/format_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 20.148438215255737, "epoch": 1.2313432835820897, "grad_norm": 2.544990301132202, "learning_rate": 3.9328315583000737e-07, "loss": 0.0002, "num_tokens": 46290443.0, "reward": 1.636052206158638, "reward_std": 0.09034244809299707, "rewards/accuracy_reward": 0.3180803582072258, "rewards/cosine_scaled_reward": 0.31797176599502563, "rewards/format_reward": 1.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 20.42299175262451, "epoch": 1.2388059701492538, "grad_norm": 4.120884895324707, "learning_rate": 3.869247627472021e-07, "loss": 0.0018, "num_tokens": 46439446.0, "reward": 1.653905838727951, "reward_std": 0.10355404989928729, "rewards/accuracy_reward": 0.3270089291036129, "rewards/cosine_scaled_reward": 0.32689686864614487, "rewards/format_reward": 1.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 20.90736722946167, "epoch": 1.2462686567164178, "grad_norm": 6.220430374145508, "learning_rate": 3.805855840728246e-07, "loss": -0.0009, "num_tokens": 46583603.0, "reward": 1.5690757930278778, "reward_std": 0.1022080342995082, "rewards/accuracy_reward": 0.2845982126891613, "rewards/cosine_scaled_reward": 0.2844774592667818, "rewards/format_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 20.39955425262451, "epoch": 1.2537313432835822, "grad_norm": 3.827559232711792, "learning_rate": 3.7426669699734626e-07, "loss": 0.0, "num_tokens": 46729329.0, "reward": 1.5199765115976334, "reward_std": 0.07372096207812717, "rewards/accuracy_reward": 0.26004464365541935, "rewards/cosine_scaled_reward": 0.2599317729473114, "rewards/format_reward": 1.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 20.420759439468384, "epoch": 1.2611940298507462, "grad_norm": 2.8399739265441895, "learning_rate": 3.679691752631715e-07, "loss": 0.0012, "num_tokens": 46881738.0, "reward": 1.6873881220817566, "reward_std": 0.028184092890906953, "rewards/accuracy_reward": 0.34375000558793545, "rewards/cosine_scaled_reward": 0.3436380457133055, "rewards/format_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 20.460938453674316, "epoch": 1.2686567164179103, "grad_norm": 3.9981281757354736, "learning_rate": 3.6169408898217966e-07, "loss": -0.0007, "num_tokens": 47032783.0, "reward": 1.5735474079847336, "reward_std": 0.09409430988146994, "rewards/accuracy_reward": 0.28683035261929035, "rewards/cosine_scaled_reward": 0.28671699203550816, "rewards/format_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 20.434152603149414, "epoch": 1.2761194029850746, "grad_norm": 4.026747226715088, "learning_rate": 3.554425044538867e-07, "loss": -0.0003, "num_tokens": 47186252.0, "reward": 1.6293520778417587, "reward_std": 0.06463150960456687, "rewards/accuracy_reward": 0.31473214738070965, "rewards/cosine_scaled_reward": 0.31461989507079124, "rewards/format_reward": 1.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 20.639509677886963, "epoch": 1.2835820895522387, "grad_norm": 2.835671901702881, "learning_rate": 3.492154839842524e-07, "loss": -0.001, "num_tokens": 47337545.0, "reward": 1.6829200685024261, "reward_std": 0.11416030763536611, "rewards/accuracy_reward": 0.34151785261929035, "rewards/cosine_scaled_reward": 0.3414021451026201, "rewards/format_reward": 1.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 20.23549175262451, "epoch": 1.291044776119403, "grad_norm": 2.3982043266296387, "learning_rate": 3.430140857051674e-07, "loss": 0.0006, "num_tokens": 47485516.0, "reward": 1.7543545216321945, "reward_std": 0.08049214289215456, "rewards/accuracy_reward": 0.3772321417927742, "rewards/cosine_scaled_reward": 0.37712232768535614, "rewards/format_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 20.290179014205933, "epoch": 1.2985074626865671, "grad_norm": 2.3194615840911865, "learning_rate": 3.3683936339464955e-07, "loss": -0.0009, "num_tokens": 47626200.0, "reward": 1.5512276887893677, "reward_std": 0.05493424205126729, "rewards/accuracy_reward": 0.2756696483120322, "rewards/cosine_scaled_reward": 0.2755579724907875, "rewards/format_reward": 1.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 20.627233028411865, "epoch": 1.3059701492537314, "grad_norm": 1.7407325506210327, "learning_rate": 3.3069236629777884e-07, "loss": 0.0019, "num_tokens": 47773482.0, "reward": 1.7253312766551971, "reward_std": 0.04794870165083864, "rewards/accuracy_reward": 0.3627232201397419, "rewards/cosine_scaled_reward": 0.36260795034468174, "rewards/format_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 20.26339364051819, "epoch": 1.3134328358208955, "grad_norm": 1.944441556930542, "learning_rate": 3.2457413894840514e-07, "loss": 0.0005, "num_tokens": 47925630.0, "reward": 1.542300522327423, "reward_std": 0.04922672476845946, "rewards/accuracy_reward": 0.2712053554132581, "rewards/cosine_scaled_reward": 0.2710951156914234, "rewards/format_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 20.04799175262451, "epoch": 1.3208955223880596, "grad_norm": 2.3487837314605713, "learning_rate": 3.184857209916528e-07, "loss": 0.0009, "num_tokens": 48071681.0, "reward": 1.5333750247955322, "reward_std": 0.08439638444930608, "rewards/accuracy_reward": 0.26674107275903225, "rewards/cosine_scaled_reward": 0.2666339073330164, "rewards/format_reward": 1.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 20.16294765472412, "epoch": 1.328358208955224, "grad_norm": 2.5152955055236816, "learning_rate": 3.124281470072597e-07, "loss": 0.0007, "num_tokens": 48219603.0, "reward": 1.6070343852043152, "reward_std": 0.07860209648621108, "rewards/accuracy_reward": 0.3035714291036129, "rewards/cosine_scaled_reward": 0.3034628815948963, "rewards/format_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 20.195312976837158, "epoch": 1.335820895522388, "grad_norm": 2.6069984436035156, "learning_rate": 3.064024463337747e-07, "loss": -0.0006, "num_tokens": 48382522.0, "reward": 1.7208726704120636, "reward_std": 0.12136359186843038, "rewards/accuracy_reward": 0.3604910708963871, "rewards/cosine_scaled_reward": 0.3603815697133541, "rewards/format_reward": 1.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 20.24330425262451, "epoch": 1.3432835820895521, "grad_norm": 3.2656219005584717, "learning_rate": 3.004096428936461e-07, "loss": -0.0005, "num_tokens": 48544316.0, "reward": 1.7164078652858734, "reward_std": 0.12633045494445128, "rewards/accuracy_reward": 0.3582589328289032, "rewards/cosine_scaled_reward": 0.3581488821655512, "rewards/format_reward": 1.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 20.943081378936768, "epoch": 1.3507462686567164, "grad_norm": 1.9599158763885498, "learning_rate": 2.9445075501923176e-07, "loss": 0.0005, "num_tokens": 48698129.0, "reward": 1.6896116733551025, "reward_std": 0.055237259725728904, "rewards/accuracy_reward": 0.3404017873108387, "rewards/cosine_scaled_reward": 0.3492097966372967, "rewards/format_reward": 1.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 20.816965103149414, "epoch": 1.3582089552238805, "grad_norm": 2.1912848949432373, "learning_rate": 2.8852679527975685e-07, "loss": 0.0008, "num_tokens": 48846693.0, "reward": 1.5489892959594727, "reward_std": 0.07417789786538975, "rewards/accuracy_reward": 0.2745535736903548, "rewards/cosine_scaled_reward": 0.27443567011505365, "rewards/format_reward": 1.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 20.80022382736206, "epoch": 1.3656716417910448, "grad_norm": 2.0120460987091064, "learning_rate": 2.8263877030925277e-07, "loss": -0.0003, "num_tokens": 48993618.0, "reward": 1.6047926098108292, "reward_std": 0.08897415082109461, "rewards/accuracy_reward": 0.3024553554132581, "rewards/cosine_scaled_reward": 0.30233720503747463, "rewards/format_reward": 1.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 20.600447416305542, "epoch": 1.373134328358209, "grad_norm": 2.198028087615967, "learning_rate": 2.767876806355045e-07, "loss": 0.0011, "num_tokens": 49145180.0, "reward": 1.5378303229808807, "reward_std": 0.10318562714383006, "rewards/accuracy_reward": 0.2689732126891613, "rewards/cosine_scaled_reward": 0.2688570562750101, "rewards/format_reward": 1.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 20.30580472946167, "epoch": 1.3805970149253732, "grad_norm": 3.135545015335083, "learning_rate": 2.709745205100337e-07, "loss": 0.0007, "num_tokens": 49288630.0, "reward": 1.6762290000915527, "reward_std": 0.10257845791056752, "rewards/accuracy_reward": 0.3381696464493871, "rewards/cosine_scaled_reward": 0.3380592940375209, "rewards/format_reward": 1.0, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 20.150670051574707, "epoch": 1.3880597014925373, "grad_norm": 2.3859987258911133, "learning_rate": 2.652002777391507e-07, "loss": 0.0003, "num_tokens": 49430613.0, "reward": 1.7298022359609604, "reward_std": 0.09394080052152276, "rewards/accuracy_reward": 0.36495535634458065, "rewards/cosine_scaled_reward": 0.36484682094305754, "rewards/format_reward": 1.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 20.50892972946167, "epoch": 1.3955223880597014, "grad_norm": 2.2975528240203857, "learning_rate": 2.594659335161008e-07, "loss": -0.0025, "num_tokens": 49577965.0, "reward": 1.555689975619316, "reward_std": 0.07875558780506253, "rewards/accuracy_reward": 0.2779017873108387, "rewards/cosine_scaled_reward": 0.2777881510555744, "rewards/format_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 20.405134916305542, "epoch": 1.4029850746268657, "grad_norm": 1.598307490348816, "learning_rate": 2.5377246225433304e-07, "loss": -0.0001, "num_tokens": 49731048.0, "reward": 1.6873885244131088, "reward_std": 0.061549990693965384, "rewards/accuracy_reward": 0.34375, "rewards/cosine_scaled_reward": 0.3436384294182062, "rewards/format_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 20.441964864730835, "epoch": 1.4104477611940298, "grad_norm": 1.098617434501648, "learning_rate": 2.4812083142192323e-07, "loss": -0.0004, "num_tokens": 49875452.0, "reward": 1.6940844804048538, "reward_std": 0.03013577858569505, "rewards/accuracy_reward": 0.34709821455180645, "rewards/cosine_scaled_reward": 0.34698620066046715, "rewards/format_reward": 1.0, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 20.527902841567993, "epoch": 1.417910447761194, "grad_norm": 2.504321336746216, "learning_rate": 2.4251200137717543e-07, "loss": 0.0003, "num_tokens": 50048789.0, "reward": 1.6360465586185455, "reward_std": 0.08695389210011228, "rewards/accuracy_reward": 0.3180803582072258, "rewards/cosine_scaled_reward": 0.3179661240428686, "rewards/format_reward": 1.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 20.17299175262451, "epoch": 1.4253731343283582, "grad_norm": 2.5839359760284424, "learning_rate": 2.3694692520543292e-07, "loss": 0.0013, "num_tokens": 50197200.0, "reward": 1.6204270422458649, "reward_std": 0.07372181725033755, "rewards/accuracy_reward": 0.31026786006987095, "rewards/cosine_scaled_reward": 0.31015911884605885, "rewards/format_reward": 1.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 20.678572416305542, "epoch": 1.4328358208955223, "grad_norm": 2.182985544204712, "learning_rate": 2.314265485571235e-07, "loss": 0.0091, "num_tokens": 50350120.0, "reward": 1.66729336977005, "reward_std": 0.08215013663391346, "rewards/accuracy_reward": 0.3337053582072258, "rewards/cosine_scaled_reward": 0.3335879575461149, "rewards/format_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 20.617188692092896, "epoch": 1.4402985074626866, "grad_norm": 1.8787038326263428, "learning_rate": 2.2595180948706926e-07, "loss": 0.0005, "num_tokens": 50507097.0, "reward": 1.6829163581132889, "reward_std": 0.03772871130308175, "rewards/accuracy_reward": 0.34151786006987095, "rewards/cosine_scaled_reward": 0.34139839746057987, "rewards/format_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 20.189732551574707, "epoch": 1.4477611940298507, "grad_norm": 1.7796034812927246, "learning_rate": 2.2052363829508776e-07, "loss": -0.0002, "num_tokens": 50667171.0, "reward": 1.7521222680807114, "reward_std": 0.05102925866069086, "rewards/accuracy_reward": 0.37611608020961285, "rewards/cosine_scaled_reward": 0.37600609846413136, "rewards/format_reward": 1.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 20.324777603149414, "epoch": 1.455223880597015, "grad_norm": 2.2849700450897217, "learning_rate": 2.1514295736790838e-07, "loss": 0.0008, "num_tokens": 50814222.0, "reward": 1.735380157828331, "reward_std": 0.09453902203552644, "rewards/accuracy_reward": 0.36830357275903225, "rewards/cosine_scaled_reward": 0.3681926131248474, "rewards/format_reward": 0.9988839253783226, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 20.19084906578064, "epoch": 1.462686567164179, "grad_norm": 3.1908223628997803, "learning_rate": 2.0981068102243616e-07, "loss": 0.0004, "num_tokens": 50957049.0, "reward": 1.6472126096487045, "reward_std": 0.09634861818715024, "rewards/accuracy_reward": 0.32366071455180645, "rewards/cosine_scaled_reward": 0.323551825247705, "rewards/format_reward": 1.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 20.42299199104309, "epoch": 1.4701492537313432, "grad_norm": 1.6977643966674805, "learning_rate": 2.0452771535038515e-07, "loss": 0.0007, "num_tokens": 51099484.0, "reward": 1.54006627202034, "reward_std": 0.043066854786879105, "rewards/accuracy_reward": 0.27008928917348385, "rewards/cosine_scaled_reward": 0.26997692696750164, "rewards/format_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 20.802456378936768, "epoch": 1.4776119402985075, "grad_norm": 2.020225763320923, "learning_rate": 1.9929495806431023e-07, "loss": 0.0003, "num_tokens": 51261563.0, "reward": 1.6829185336828232, "reward_std": 0.058621840249692525, "rewards/accuracy_reward": 0.3415178619325161, "rewards/cosine_scaled_reward": 0.3414005693048239, "rewards/format_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 21.07924199104309, "epoch": 1.4850746268656716, "grad_norm": 1.6952190399169922, "learning_rate": 1.9411329834506286e-07, "loss": 0.0012, "num_tokens": 51404026.0, "reward": 1.4038956314325333, "reward_std": 0.053959765473592824, "rewards/accuracy_reward": 0.2020089328289032, "rewards/cosine_scaled_reward": 0.20188664738088846, "rewards/format_reward": 1.0, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 20.199777841567993, "epoch": 1.4925373134328357, "grad_norm": 2.445509433746338, "learning_rate": 1.8898361669069497e-07, "loss": -0.0013, "num_tokens": 51564021.0, "reward": 1.640516072511673, "reward_std": 0.09235968008678697, "rewards/accuracy_reward": 0.32031249813735485, "rewards/cosine_scaled_reward": 0.3202035166323185, "rewards/format_reward": 1.0, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 20.577009916305542, "epoch": 1.5, "grad_norm": 1.4149043560028076, "learning_rate": 1.8390678476684142e-07, "loss": 0.0007, "num_tokens": 51712674.0, "reward": 1.5110464841127396, "reward_std": 0.038402879612469576, "rewards/accuracy_reward": 0.2555803582072258, "rewards/cosine_scaled_reward": 0.2554660662135575, "rewards/format_reward": 1.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 20.31361699104309, "epoch": 1.5074626865671643, "grad_norm": 1.3769646883010864, "learning_rate": 1.7888366525859967e-07, "loss": 0.0003, "num_tokens": 51862699.0, "reward": 1.7298002988100052, "reward_std": 0.03547355129772001, "rewards/accuracy_reward": 0.3649553656578064, "rewards/cosine_scaled_reward": 0.3648448847234249, "rewards/format_reward": 1.0, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 20.623884916305542, "epoch": 1.5149253731343284, "grad_norm": 3.1723361015319824, "learning_rate": 1.7391511172393848e-07, "loss": 0.0012, "num_tokens": 52014514.0, "reward": 1.5690792500972748, "reward_std": 0.09754315220763488, "rewards/accuracy_reward": 0.28459821082651615, "rewards/cosine_scaled_reward": 0.2844809675589204, "rewards/format_reward": 1.0, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 20.436384916305542, "epoch": 1.5223880597014925, "grad_norm": 1.6968096494674683, "learning_rate": 1.690019684486557e-07, "loss": -0.0003, "num_tokens": 52161273.0, "reward": 1.7119403928518295, "reward_std": 0.031414411859074676, "rewards/accuracy_reward": 0.35602678544819355, "rewards/cosine_scaled_reward": 0.3559135627001524, "rewards/format_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 20.014509677886963, "epoch": 1.5298507462686568, "grad_norm": 2.15055513381958, "learning_rate": 1.6414507030291246e-07, "loss": 0.0008, "num_tokens": 52310342.0, "reward": 1.7543574571609497, "reward_std": 0.06756016856554936, "rewards/accuracy_reward": 0.3772321417927742, "rewards/cosine_scaled_reward": 0.3771252781152725, "rewards/format_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 20.49553632736206, "epoch": 1.537313432835821, "grad_norm": 2.6926231384277344, "learning_rate": 1.5934524259936753e-07, "loss": -0.0012, "num_tokens": 52466618.0, "reward": 1.5155121833086014, "reward_std": 0.1101713702082634, "rewards/accuracy_reward": 0.2578125009313226, "rewards/cosine_scaled_reward": 0.25769960321485996, "rewards/format_reward": 1.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 20.606027364730835, "epoch": 1.544776119402985, "grad_norm": 2.4713823795318604, "learning_rate": 1.5460330095293443e-07, "loss": 0.0008, "num_tokens": 52628361.0, "reward": 1.767742782831192, "reward_std": 0.07515440785066829, "rewards/accuracy_reward": 0.38392857275903225, "rewards/cosine_scaled_reward": 0.38381412625312805, "rewards/format_reward": 1.0, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 20.731027603149414, "epoch": 1.5522388059701493, "grad_norm": 2.261600971221924, "learning_rate": 1.4992005114218804e-07, "loss": 0.0009, "num_tokens": 52775840.0, "reward": 1.6472041308879852, "reward_std": 0.07109666805187231, "rewards/accuracy_reward": 0.32366071827709675, "rewards/cosine_scaled_reward": 0.32354335859417915, "rewards/format_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 21.045759677886963, "epoch": 1.5597014925373134, "grad_norm": 0.7100464105606079, "learning_rate": 1.4529628897244212e-07, "loss": 0.0004, "num_tokens": 52938273.0, "reward": 1.6248737573623657, "reward_std": 0.012628240511414646, "rewards/accuracy_reward": 0.31250000838190317, "rewards/cosine_scaled_reward": 0.31237365305423737, "rewards/format_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 20.343751192092896, "epoch": 1.5671641791044775, "grad_norm": 1.3848381042480469, "learning_rate": 1.4073280014052074e-07, "loss": 0.0004, "num_tokens": 53086589.0, "reward": 1.736496239900589, "reward_std": 0.02916058116019471, "rewards/accuracy_reward": 0.36830357648432255, "rewards/cosine_scaled_reward": 0.36819261126220226, "rewards/format_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 20.465402841567993, "epoch": 1.5746268656716418, "grad_norm": 2.789355993270874, "learning_rate": 1.3623036010124845e-07, "loss": 0.0004, "num_tokens": 53244486.0, "reward": 1.7097086608409882, "reward_std": 0.08943129004910588, "rewards/accuracy_reward": 0.3549107173457742, "rewards/cosine_scaled_reward": 0.35479787550866604, "rewards/format_reward": 1.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 20.30022406578064, "epoch": 1.582089552238806, "grad_norm": 2.885791778564453, "learning_rate": 1.3178973393568056e-07, "loss": 0.0001, "num_tokens": 53394467.0, "reward": 1.6762287318706512, "reward_std": 0.08162061781850127, "rewards/accuracy_reward": 0.33816964738070965, "rewards/cosine_scaled_reward": 0.3380589783191681, "rewards/format_reward": 1.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 20.32366132736206, "epoch": 1.5895522388059702, "grad_norm": 2.8630964756011963, "learning_rate": 1.2741167622109555e-07, "loss": 0.0, "num_tokens": 53546981.0, "reward": 1.6427462249994278, "reward_std": 0.058320232714407894, "rewards/accuracy_reward": 0.3214285708963871, "rewards/cosine_scaled_reward": 0.3213176503777504, "rewards/format_reward": 1.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 20.03236675262451, "epoch": 1.5970149253731343, "grad_norm": 2.2839713096618652, "learning_rate": 1.230969309027739e-07, "loss": -0.001, "num_tokens": 53698458.0, "reward": 1.6181965470314026, "reward_std": 0.07244142005220056, "rewards/accuracy_reward": 0.30915178707800806, "rewards/cosine_scaled_reward": 0.3090446996502578, "rewards/format_reward": 1.0, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 20.234375953674316, "epoch": 1.6044776119402986, "grad_norm": 2.546302556991577, "learning_rate": 1.1884623116758119e-07, "loss": 0.0002, "num_tokens": 53849116.0, "reward": 1.651675522327423, "reward_std": 0.0710948963102993, "rewards/accuracy_reward": 0.32589285634458065, "rewards/cosine_scaled_reward": 0.3257826119661331, "rewards/format_reward": 1.0, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 20.366072177886963, "epoch": 1.6119402985074627, "grad_norm": 2.6176400184631348, "learning_rate": 1.1466029931938181e-07, "loss": 0.0007, "num_tokens": 53999708.0, "reward": 1.6025667041540146, "reward_std": 0.055539907814715406, "rewards/accuracy_reward": 0.30133928917348385, "rewards/cosine_scaled_reward": 0.30122734420001507, "rewards/format_reward": 1.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 20.61830449104309, "epoch": 1.6194029850746268, "grad_norm": 2.8905839920043945, "learning_rate": 1.1053984665630023e-07, "loss": 0.0013, "num_tokens": 54147318.0, "reward": 1.6003320217132568, "reward_std": 0.12910877341846927, "rewards/accuracy_reward": 0.30022321455180645, "rewards/cosine_scaled_reward": 0.3001087475568056, "rewards/format_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 20.37834930419922, "epoch": 1.626865671641791, "grad_norm": 2.013456106185913, "learning_rate": 1.0648557334985308e-07, "loss": 0.001, "num_tokens": 54315337.0, "reward": 1.3972100466489792, "reward_std": 0.08131316915778086, "rewards/accuracy_reward": 0.19866071827709675, "rewards/cosine_scaled_reward": 0.1985492706298828, "rewards/format_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 20.402902841567993, "epoch": 1.6343283582089554, "grad_norm": 2.0653622150421143, "learning_rate": 1.024981683259723e-07, "loss": 0.0005, "num_tokens": 54458594.0, "reward": 1.7476565390825272, "reward_std": 0.05102954798443449, "rewards/accuracy_reward": 0.3738839291036129, "rewards/cosine_scaled_reward": 0.37377250753343105, "rewards/format_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 20.71651840209961, "epoch": 1.6417910447761193, "grad_norm": 1.5992612838745117, "learning_rate": 9.857830914793824e-08, "loss": 0.0009, "num_tokens": 54607180.0, "reward": 1.7409540712833405, "reward_std": 0.04907748210338525, "rewards/accuracy_reward": 0.37053571827709675, "rewards/cosine_scaled_reward": 0.37041825242340565, "rewards/format_reward": 1.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 20.84709906578064, "epoch": 1.6492537313432836, "grad_norm": 1.980920672416687, "learning_rate": 9.472666190124456e-08, "loss": 0.0006, "num_tokens": 54749371.0, "reward": 1.746531069278717, "reward_std": 0.06538078441796102, "rewards/accuracy_reward": 0.37388392724096775, "rewards/cosine_scaled_reward": 0.3737631347030401, "rewards/format_reward": 0.9988839253783226, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 20.19642925262451, "epoch": 1.6567164179104479, "grad_norm": 2.4586503505706787, "learning_rate": 9.094388108041301e-08, "loss": -0.0002, "num_tokens": 54894707.0, "reward": 1.6717657148838043, "reward_std": 0.07176896455235493, "rewards/accuracy_reward": 0.3359375037252903, "rewards/cosine_scaled_reward": 0.3358281459659338, "rewards/format_reward": 1.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 19.99776864051819, "epoch": 1.664179104477612, "grad_norm": 1.5232560634613037, "learning_rate": 8.723060947777777e-08, "loss": 0.001, "num_tokens": 55046777.0, "reward": 1.4195362627506256, "reward_std": 0.05862198262564533, "rewards/accuracy_reward": 0.2098214307334274, "rewards/cosine_scaled_reward": 0.2097147584427148, "rewards/format_reward": 1.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 20.31808066368103, "epoch": 1.671641791044776, "grad_norm": 1.8125054836273193, "learning_rate": 8.358747807425826e-08, "loss": 0.0004, "num_tokens": 55197894.0, "reward": 1.6070322841405869, "reward_std": 0.06688734842464328, "rewards/accuracy_reward": 0.30357143096625805, "rewards/cosine_scaled_reward": 0.3034607693552971, "rewards/format_reward": 1.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 20.676340341567993, "epoch": 1.6791044776119404, "grad_norm": 2.117981195449829, "learning_rate": 8.001510593213945e-08, "loss": 0.0009, "num_tokens": 55361956.0, "reward": 1.5824733972549438, "reward_std": 0.061247594597631405, "rewards/accuracy_reward": 0.2912946464493871, "rewards/cosine_scaled_reward": 0.2911787135526538, "rewards/format_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 20.402902603149414, "epoch": 1.6865671641791045, "grad_norm": 2.322235584259033, "learning_rate": 7.651410008987697e-08, "loss": -0.0005, "num_tokens": 55534853.0, "reward": 1.5400669574737549, "reward_std": 0.07680402030835864, "rewards/accuracy_reward": 0.2700892873108387, "rewards/cosine_scaled_reward": 0.26997758261859417, "rewards/format_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 20.39732265472412, "epoch": 1.6940298507462686, "grad_norm": 2.2915542125701904, "learning_rate": 7.308505545894566e-08, "loss": 0.0015, "num_tokens": 55689753.0, "reward": 1.5065849125385284, "reward_std": 0.05005305098056567, "rewards/accuracy_reward": 0.2533482164144516, "rewards/cosine_scaled_reward": 0.25323665514588356, "rewards/format_reward": 1.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 20.694197177886963, "epoch": 1.7014925373134329, "grad_norm": 0.9342123866081238, "learning_rate": 6.972855472274852e-08, "loss": 0.0018, "num_tokens": 55839255.0, "reward": 1.5266692787408829, "reward_std": 0.020895601193345215, "rewards/accuracy_reward": 0.2633928619325161, "rewards/cosine_scaled_reward": 0.2632763609290123, "rewards/format_reward": 1.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 20.92299199104309, "epoch": 1.7089552238805972, "grad_norm": 1.9537469148635864, "learning_rate": 6.644516823760437e-08, "loss": 0.0, "num_tokens": 55992818.0, "reward": 1.6561282128095627, "reward_std": 0.07515494169327752, "rewards/accuracy_reward": 0.3281250037252903, "rewards/cosine_scaled_reward": 0.3280031867325306, "rewards/format_reward": 1.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 20.45870614051819, "epoch": 1.716417910447761, "grad_norm": 2.364659547805786, "learning_rate": 6.323545393582847e-08, "loss": 0.0011, "num_tokens": 56149261.0, "reward": 1.718636766076088, "reward_std": 0.12023507321784876, "rewards/accuracy_reward": 0.35937500186264515, "rewards/cosine_scaled_reward": 0.35926168598234653, "rewards/format_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 20.56584858894348, "epoch": 1.7238805970149254, "grad_norm": 3.120504379272461, "learning_rate": 6.009995723092653e-08, "loss": -0.0001, "num_tokens": 56305584.0, "reward": 1.8458667993545532, "reward_std": 0.08213974455068751, "rewards/accuracy_reward": 0.42299107275903225, "rewards/cosine_scaled_reward": 0.42287569493055344, "rewards/format_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 20.895090103149414, "epoch": 1.7313432835820897, "grad_norm": 1.5434519052505493, "learning_rate": 5.703921092491393e-08, "loss": 0.0003, "num_tokens": 56468698.0, "reward": 1.6784496754407883, "reward_std": 0.029160332879651918, "rewards/accuracy_reward": 0.3392857192084193, "rewards/cosine_scaled_reward": 0.3391639143228531, "rewards/format_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 20.31584906578064, "epoch": 1.7388059701492538, "grad_norm": 2.2976887226104736, "learning_rate": 5.405373511777939e-08, "loss": 0.0003, "num_tokens": 56622173.0, "reward": 1.6114967614412308, "reward_std": 0.07515410965470437, "rewards/accuracy_reward": 0.30580357275903225, "rewards/cosine_scaled_reward": 0.30569313652813435, "rewards/format_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 20.062500715255737, "epoch": 1.7462686567164178, "grad_norm": 2.2692880630493164, "learning_rate": 5.114403711910631e-08, "loss": 0.0004, "num_tokens": 56781293.0, "reward": 1.6472149044275284, "reward_std": 0.07417689614470646, "rewards/accuracy_reward": 0.32194368727505207, "rewards/cosine_scaled_reward": 0.328018419444561, "rewards/format_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 20.42745614051819, "epoch": 1.7537313432835822, "grad_norm": 1.9118342399597168, "learning_rate": 4.831061136186787e-08, "loss": -0.0002, "num_tokens": 56925372.0, "reward": 1.6449773013591766, "reward_std": 0.06771036455336343, "rewards/accuracy_reward": 0.3225446417927742, "rewards/cosine_scaled_reward": 0.322432579472661, "rewards/format_reward": 1.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 20.910715103149414, "epoch": 1.7611940298507462, "grad_norm": 2.7811856269836426, "learning_rate": 4.5553939318410004e-08, "loss": 0.0011, "num_tokens": 57072716.0, "reward": 1.825772985816002, "reward_std": 0.09235973202066816, "rewards/accuracy_reward": 0.4129464291036129, "rewards/cosine_scaled_reward": 0.4128264728933573, "rewards/format_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 20.27678656578064, "epoch": 1.7686567164179103, "grad_norm": 1.4767699241638184, "learning_rate": 4.287448941863692e-08, "loss": 0.0006, "num_tokens": 57219380.0, "reward": 1.6014523804187775, "reward_std": 0.04930526966539617, "rewards/accuracy_reward": 0.30133928917348385, "rewards/cosine_scaled_reward": 0.30122908018529415, "rewards/format_reward": 0.9988839253783226, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 20.325893878936768, "epoch": 1.7761194029850746, "grad_norm": 3.0593392848968506, "learning_rate": 4.0272716970412516e-08, "loss": -0.0001, "num_tokens": 57371640.0, "reward": 1.5713180601596832, "reward_std": 0.124595548491925, "rewards/accuracy_reward": 0.2857142901048064, "rewards/cosine_scaled_reward": 0.28560364712029696, "rewards/format_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 20.405134677886963, "epoch": 1.783582089552239, "grad_norm": 2.0308165550231934, "learning_rate": 3.774906408219197e-08, "loss": 0.0018, "num_tokens": 57518011.0, "reward": 1.6494415253400803, "reward_std": 0.04959785374813919, "rewards/accuracy_reward": 0.3247767873108387, "rewards/cosine_scaled_reward": 0.32466465793550014, "rewards/format_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 20.67745614051819, "epoch": 1.7910447761194028, "grad_norm": 1.7336244583129883, "learning_rate": 3.5303959587895896e-08, "loss": 0.0034, "num_tokens": 57677018.0, "reward": 1.591400220990181, "reward_std": 0.03742815442538472, "rewards/accuracy_reward": 0.29575893096625805, "rewards/cosine_scaled_reward": 0.2956412099301815, "rewards/format_reward": 1.0, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 20.30803656578064, "epoch": 1.7985074626865671, "grad_norm": 2.150383234024048, "learning_rate": 3.293781897404063e-08, "loss": -0.001, "num_tokens": 57828678.0, "reward": 1.6829252541065216, "reward_std": 0.06816608617647546, "rewards/accuracy_reward": 0.34151785634458065, "rewards/cosine_scaled_reward": 0.34140734374523163, "rewards/format_reward": 1.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 20.556920528411865, "epoch": 1.8059701492537314, "grad_norm": 1.195513367652893, "learning_rate": 3.065104430913601e-08, "loss": 0.0005, "num_tokens": 57977305.0, "reward": 1.6561355143785477, "reward_std": 0.041787590547230025, "rewards/accuracy_reward": 0.3281250074505806, "rewards/cosine_scaled_reward": 0.328010406345129, "rewards/format_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 20.666295289993286, "epoch": 1.8134328358208955, "grad_norm": 2.063978672027588, "learning_rate": 2.8444024175363733e-08, "loss": 0.0002, "num_tokens": 58129758.0, "reward": 1.6315756142139435, "reward_std": 0.07628487978815457, "rewards/accuracy_reward": 0.31584821455180645, "rewards/cosine_scaled_reward": 0.31572734005749226, "rewards/format_reward": 1.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 20.343751192092896, "epoch": 1.8208955223880596, "grad_norm": 2.139723777770996, "learning_rate": 2.6317133602547335e-08, "loss": -0.0, "num_tokens": 58286354.0, "reward": 1.6070320904254913, "reward_std": 0.06395891746558391, "rewards/accuracy_reward": 0.3035714291036129, "rewards/cosine_scaled_reward": 0.30346059799194336, "rewards/format_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 20.492188453674316, "epoch": 1.828358208955224, "grad_norm": 1.4515061378479004, "learning_rate": 2.4270734004424643e-08, "loss": 0.0004, "num_tokens": 58436883.0, "reward": 1.624885842204094, "reward_std": 0.05441444956320396, "rewards/accuracy_reward": 0.3125000074505806, "rewards/cosine_scaled_reward": 0.3123858105391264, "rewards/format_reward": 1.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 20.328125953674316, "epoch": 1.835820895522388, "grad_norm": 1.8083621263504028, "learning_rate": 2.2305173117234233e-08, "loss": -0.0003, "num_tokens": 58590025.0, "reward": 1.6539071798324585, "reward_std": 0.06463310816476309, "rewards/accuracy_reward": 0.32700893096625805, "rewards/cosine_scaled_reward": 0.3268981762230396, "rewards/format_reward": 1.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 20.55580449104309, "epoch": 1.8432835820895521, "grad_norm": 2.2442469596862793, "learning_rate": 2.0420784940626156e-08, "loss": 0.0013, "num_tokens": 58742411.0, "reward": 1.714171290397644, "reward_std": 0.08214285858039716, "rewards/accuracy_reward": 0.3571428544819355, "rewards/cosine_scaled_reward": 0.35702834837138653, "rewards/format_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 20.25111675262451, "epoch": 1.8507462686567164, "grad_norm": 2.635986804962158, "learning_rate": 1.861788968090683e-08, "loss": -0.0002, "num_tokens": 58885124.0, "reward": 1.6583720594644547, "reward_std": 0.09363627548930253, "rewards/accuracy_reward": 0.32924107275903225, "rewards/cosine_scaled_reward": 0.32913094013929367, "rewards/format_reward": 1.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 20.059152364730835, "epoch": 1.8582089552238807, "grad_norm": 2.7098944187164307, "learning_rate": 1.68967936966275e-08, "loss": 0.0005, "num_tokens": 59042857.0, "reward": 1.647214189171791, "reward_std": 0.0797318636930413, "rewards/accuracy_reward": 0.32366071455180645, "rewards/cosine_scaled_reward": 0.3235534243285656, "rewards/format_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 20.43861699104309, "epoch": 1.8656716417910446, "grad_norm": 2.948129177093506, "learning_rate": 1.525778944652617e-08, "loss": -0.0017, "num_tokens": 59189866.0, "reward": 1.6047987192869186, "reward_std": 0.09995094314217567, "rewards/accuracy_reward": 0.30245535261929035, "rewards/cosine_scaled_reward": 0.30234329774975777, "rewards/format_reward": 1.0, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 20.51116156578064, "epoch": 1.873134328358209, "grad_norm": 1.8753550052642822, "learning_rate": 1.3701155439831248e-08, "loss": 0.0003, "num_tokens": 59351876.0, "reward": 1.4842608720064163, "reward_std": 0.05374052021700493, "rewards/accuracy_reward": 0.24218750186264515, "rewards/cosine_scaled_reward": 0.2420733030885458, "rewards/format_reward": 1.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 19.960938692092896, "epoch": 1.8805970149253732, "grad_norm": 3.710313320159912, "learning_rate": 1.222715618893555e-08, "loss": -0.0011, "num_tokens": 59501249.0, "reward": 1.6673045605421066, "reward_std": 0.11986687686294317, "rewards/accuracy_reward": 0.33370536379516125, "rewards/cosine_scaled_reward": 0.3335991408675909, "rewards/format_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 20.74553632736206, "epoch": 1.8880597014925373, "grad_norm": 2.096618890762329, "learning_rate": 1.0836042164448944e-08, "loss": 0.0005, "num_tokens": 59655845.0, "reward": 1.5623822510242462, "reward_std": 0.06011815097401296, "rewards/accuracy_reward": 0.2812500037252903, "rewards/cosine_scaled_reward": 0.28113218024373055, "rewards/format_reward": 1.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 20.500000953674316, "epoch": 1.8955223880597014, "grad_norm": 2.3821330070495605, "learning_rate": 9.528049752636714e-09, "loss": 0.0004, "num_tokens": 59810181.0, "reward": 1.747654750943184, "reward_std": 0.0755647381696889, "rewards/accuracy_reward": 0.37611607275903225, "rewards/cosine_scaled_reward": 0.37600288540124893, "rewards/format_reward": 0.9955357164144516, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 20.908483266830444, "epoch": 1.9029850746268657, "grad_norm": 1.859230875968933, "learning_rate": 8.303401215251581e-09, "loss": 0.0002, "num_tokens": 59954931.0, "reward": 1.600327655673027, "reward_std": 0.03937964968498875, "rewards/accuracy_reward": 0.3002232201397419, "rewards/cosine_scaled_reward": 0.3001043573021889, "rewards/format_reward": 1.0, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 20.248884677886963, "epoch": 1.9104477611940298, "grad_norm": 2.67840313911438, "learning_rate": 7.1623046517656495e-09, "loss": 0.0004, "num_tokens": 60097050.0, "reward": 1.758818194270134, "reward_std": 0.06981627906458954, "rewards/accuracy_reward": 0.3794642873108387, "rewards/cosine_scaled_reward": 0.37935382314026356, "rewards/format_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 20.142858266830444, "epoch": 1.917910447761194, "grad_norm": 2.1021740436553955, "learning_rate": 6.104953964008897e-09, "loss": 0.0007, "num_tokens": 60255522.0, "reward": 1.5333734452724457, "reward_std": 0.08875712241180622, "rewards/accuracy_reward": 0.26674107648432255, "rewards/cosine_scaled_reward": 0.26663233898580074, "rewards/format_reward": 1.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 20.85267949104309, "epoch": 1.9253731343283582, "grad_norm": 2.7934389114379883, "learning_rate": 5.131528823220099e-09, "loss": -0.0019, "num_tokens": 60410742.0, "reward": 1.670636236667633, "reward_std": 0.08307532503371817, "rewards/accuracy_reward": 0.3348214328289032, "rewards/cosine_scaled_reward": 0.3358147069811821, "rewards/format_reward": 1.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 20.453125953674316, "epoch": 1.9328358208955225, "grad_norm": 2.496358871459961, "learning_rate": 4.242194639516416e-09, "loss": 0.0009, "num_tokens": 60571516.0, "reward": 1.6360480785369873, "reward_std": 0.07417896673651114, "rewards/accuracy_reward": 0.31808036006987095, "rewards/cosine_scaled_reward": 0.3179676216095686, "rewards/format_reward": 1.0, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 20.14955425262451, "epoch": 1.9402985074626866, "grad_norm": 1.2594853639602661, "learning_rate": 3.4371025337855407e-09, "loss": 0.0003, "num_tokens": 60722354.0, "reward": 1.586945116519928, "reward_std": 0.03156871721012067, "rewards/accuracy_reward": 0.29352678975556046, "rewards/cosine_scaled_reward": 0.29341828147880733, "rewards/format_reward": 1.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 20.205358266830444, "epoch": 1.9477611940298507, "grad_norm": 1.8333399295806885, "learning_rate": 2.7163893120066285e-09, "loss": 0.0005, "num_tokens": 60871290.0, "reward": 1.5088191330432892, "reward_std": 0.052982652708983835, "rewards/accuracy_reward": 0.2544642798602581, "rewards/cosine_scaled_reward": 0.25435481034219265, "rewards/format_reward": 1.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 20.30803656578064, "epoch": 1.955223880597015, "grad_norm": 2.2571942806243896, "learning_rate": 2.080177442003117e-09, "loss": -0.0004, "num_tokens": 61015958.0, "reward": 1.6137285828590393, "reward_std": 0.1302357604727149, "rewards/accuracy_reward": 0.3069196389988065, "rewards/cosine_scaled_reward": 0.30680886935442686, "rewards/format_reward": 1.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 20.48214340209961, "epoch": 1.962686567164179, "grad_norm": 1.8233392238616943, "learning_rate": 1.5285750326325953e-09, "loss": 0.0001, "num_tokens": 61160438.0, "reward": 1.6360481083393097, "reward_std": 0.04794773051276735, "rewards/accuracy_reward": 0.31808035634458065, "rewards/cosine_scaled_reward": 0.31796768493950367, "rewards/format_reward": 1.0, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 21.101563215255737, "epoch": 1.9701492537313432, "grad_norm": 1.5878843069076538, "learning_rate": 1.0616758154161631e-09, "loss": 0.0006, "num_tokens": 61316161.0, "reward": 1.6092515885829926, "reward_std": 0.0644803009436572, "rewards/accuracy_reward": 0.30468749441206455, "rewards/cosine_scaled_reward": 0.30456401966512203, "rewards/format_reward": 1.0, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 20.38839364051819, "epoch": 1.9776119402985075, "grad_norm": 2.332639455795288, "learning_rate": 6.795591286109514e-10, "loss": -0.0004, "num_tokens": 61460901.0, "reward": 1.490959793329239, "reward_std": 0.07921304133695628, "rewards/accuracy_reward": 0.24553572060540318, "rewards/cosine_scaled_reward": 0.24542399495840073, "rewards/format_reward": 1.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 20.500001192092896, "epoch": 1.9850746268656716, "grad_norm": 1.9155348539352417, "learning_rate": 3.8228990372862756e-10, "loss": 0.0008, "num_tokens": 61616757.0, "reward": 1.624886617064476, "reward_std": 0.07289814859302624, "rewards/accuracy_reward": 0.3125, "rewards/cosine_scaled_reward": 0.31238655652850866, "rewards/format_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 20.14851450920105, "epoch": 1.9925373134328357, "grad_norm": 2.2550103664398193, "learning_rate": 1.6991865450188825e-10, "loss": 0.0002, "num_tokens": 61771838.0, "reward": 1.6784630566835403, "reward_std": 0.06463183751365165, "rewards/accuracy_reward": 0.3392857164144516, "rewards/cosine_scaled_reward": 0.3391772899776697, "rewards/format_reward": 1.0, "step": 266 }, { "epoch": 1.9925373134328357, "step": 266, "total_flos": 0.0, "train_loss": 0.032786881700187384, "train_runtime": 16819.5455, "train_samples_per_second": 1.783, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 268, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }