diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43553 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997759689343589, + "eval_steps": 500, + "global_step": 3347, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 1683.430908203125, + "epoch": 0.00029870808752146963, + "grad_norm": 0.05959290266036987, + "kl": 0.0, + "learning_rate": 2.9850746268656717e-09, + "loss": 0.027, + "reward": 0.13392858020961285, + "reward_std": 0.1593554001301527, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.01116071455180645, + "step": 1 + }, + { + "completion_length": 1679.0982971191406, + "epoch": 0.0005974161750429393, + "grad_norm": 0.05722633749246597, + "kl": 0.0, + "learning_rate": 5.970149253731343e-09, + "loss": 0.0289, + "reward": 0.1428571529686451, + "reward_std": 0.11902236379683018, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.006696428870782256, + "step": 2 + }, + { + "completion_length": 1742.7589721679688, + "epoch": 0.0008961242625644089, + "grad_norm": 0.03285929933190346, + "kl": 2.8014183044433594e-05, + "learning_rate": 8.955223880597015e-09, + "loss": 0.0138, + "reward": 0.07812500232830644, + "reward_std": 0.08489762526005507, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.006696428870782256, + "step": 3 + }, + { + "completion_length": 1732.2344055175781, + "epoch": 0.0011948323500858785, + "grad_norm": 0.04361572116613388, + "kl": 3.49581241607666e-05, + "learning_rate": 1.1940298507462687e-08, + "loss": 0.0193, + "reward": 0.1741071529686451, + "reward_std": 0.11444765329360962, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.006696428870782256, + "step": 4 + }, + { + "completion_length": 1611.8125915527344, + "epoch": 0.0014935404376073482, + "grad_norm": 0.03971505165100098, + "kl": 3.8623809814453125e-05, + "learning_rate": 1.4925373134328357e-08, + "loss": 0.0121, + "reward": 0.1026785746216774, + "reward_std": 0.05782640352845192, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.006696428870782256, + "step": 5 + }, + { + "completion_length": 1531.1942443847656, + "epoch": 0.0017922485251288178, + "grad_norm": 0.06698373705148697, + "kl": 4.011392593383789e-05, + "learning_rate": 1.791044776119403e-08, + "loss": 0.0212, + "reward": 0.1517857201397419, + "reward_std": 0.18100670352578163, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.013392857741564512, + "step": 6 + }, + { + "completion_length": 1738.8036499023438, + "epoch": 0.0020909566126502874, + "grad_norm": 0.026595046743750572, + "kl": 3.692507743835449e-05, + "learning_rate": 2.08955223880597e-08, + "loss": 0.0104, + "reward": 0.03348214388824999, + "reward_std": 0.05118321720510721, + "rewards/accuracy_reward": 0.03348214388824999, + "rewards/format_reward": 0.0, + "step": 7 + }, + { + "completion_length": 1692.7768859863281, + "epoch": 0.002389664700171757, + "grad_norm": 0.05236653611063957, + "kl": 3.600120544433594e-05, + "learning_rate": 2.3880597014925373e-08, + "loss": 0.0324, + "reward": 0.11607143143191934, + "reward_std": 0.10519563034176826, + "rewards/accuracy_reward": 0.10267857438884676, + "rewards/format_reward": 0.013392857741564512, + "step": 8 + }, + { + "completion_length": 1700.5826721191406, + "epoch": 0.0026883727876932267, + "grad_norm": 0.0650784894824028, + "kl": 3.337860107421875e-05, + "learning_rate": 2.6865671641791042e-08, + "loss": 0.047, + "reward": 0.14732144260779023, + "reward_std": 0.1892076302319765, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.011160715017467737, + "step": 9 + }, + { + "completion_length": 1740.0379943847656, + "epoch": 0.0029870808752146963, + "grad_norm": 0.06384691596031189, + "kl": 3.6776065826416016e-05, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.0262, + "reward": 0.16294643469154835, + "reward_std": 0.17173518240451813, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.004464285913854837, + "step": 10 + }, + { + "completion_length": 1662.9799499511719, + "epoch": 0.003285788962736166, + "grad_norm": 0.06471554934978485, + "kl": 3.585219383239746e-05, + "learning_rate": 3.2835820895522386e-08, + "loss": 0.0296, + "reward": 0.11383929196745157, + "reward_std": 0.13436322286725044, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0066964291036129, + "step": 11 + }, + { + "completion_length": 1668.0045471191406, + "epoch": 0.0035844970502576356, + "grad_norm": 0.04711868613958359, + "kl": 2.8699636459350586e-05, + "learning_rate": 3.582089552238806e-08, + "loss": 0.0146, + "reward": 0.13616071920841932, + "reward_std": 0.11797658074647188, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.008928571827709675, + "step": 12 + }, + { + "completion_length": 1782.8482971191406, + "epoch": 0.003883205137779105, + "grad_norm": 0.049867287278175354, + "kl": 3.591179847717285e-05, + "learning_rate": 3.880597014925373e-08, + "loss": 0.0072, + "reward": 0.10937500232830644, + "reward_std": 0.12336567975580692, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.004464285913854837, + "step": 13 + }, + { + "completion_length": 1692.6473999023438, + "epoch": 0.004181913225300575, + "grad_norm": 0.05358385667204857, + "kl": 4.0143728256225586e-05, + "learning_rate": 4.17910447761194e-08, + "loss": 0.0334, + "reward": 0.1651785783469677, + "reward_std": 0.12319108843803406, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0022321429569274187, + "step": 14 + }, + { + "completion_length": 1588.8259887695312, + "epoch": 0.004480621312822045, + "grad_norm": 0.07240007072687149, + "kl": 4.106760025024414e-05, + "learning_rate": 4.477611940298507e-08, + "loss": 0.0325, + "reward": 0.191964291036129, + "reward_std": 0.15889086574316025, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.006696428870782256, + "step": 15 + }, + { + "completion_length": 1752.0692749023438, + "epoch": 0.004779329400343514, + "grad_norm": 0.04957496374845505, + "kl": 3.731250762939453e-05, + "learning_rate": 4.776119402985075e-08, + "loss": 0.0262, + "reward": 0.0937500037252903, + "reward_std": 0.11411896906793118, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0022321429569274187, + "step": 16 + }, + { + "completion_length": 1623.2590026855469, + "epoch": 0.005078037487864984, + "grad_norm": 0.04496748745441437, + "kl": 2.905726432800293e-05, + "learning_rate": 5.074626865671641e-08, + "loss": 0.0139, + "reward": 0.12723215227015316, + "reward_std": 0.10904676001518965, + "rewards/accuracy_reward": 0.12053572200238705, + "rewards/format_reward": 0.006696428870782256, + "step": 17 + }, + { + "completion_length": 1692.6898193359375, + "epoch": 0.005376745575386453, + "grad_norm": 0.05351587012410164, + "kl": 3.8564205169677734e-05, + "learning_rate": 5.3731343283582085e-08, + "loss": 0.0206, + "reward": 0.14508929662406445, + "reward_std": 0.13898197375237942, + "rewards/accuracy_reward": 0.13616071920841932, + "rewards/format_reward": 0.008928571827709675, + "step": 18 + }, + { + "completion_length": 1586.1451721191406, + "epoch": 0.0056754536629079234, + "grad_norm": 0.06026902794837952, + "kl": 3.5881996154785156e-05, + "learning_rate": 5.671641791044776e-08, + "loss": 0.0205, + "reward": 0.196428582072258, + "reward_std": 0.12449068203568459, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.006696428870782256, + "step": 19 + }, + { + "completion_length": 1744.7500610351562, + "epoch": 0.005974161750429393, + "grad_norm": 0.052619386464357376, + "kl": 4.26173210144043e-05, + "learning_rate": 5.970149253731343e-08, + "loss": 0.0211, + "reward": 0.1696428619325161, + "reward_std": 0.13681860640645027, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.0022321429569274187, + "step": 20 + }, + { + "completion_length": 1694.6116943359375, + "epoch": 0.006272869837950863, + "grad_norm": 0.03624781221151352, + "kl": 4.0411949157714844e-05, + "learning_rate": 6.26865671641791e-08, + "loss": 0.0017, + "reward": 0.13392857927829027, + "reward_std": 0.07979018054902554, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.013392857974395156, + "step": 21 + }, + { + "completion_length": 1719.3572082519531, + "epoch": 0.006571577925472332, + "grad_norm": 0.04365534335374832, + "kl": 3.7670135498046875e-05, + "learning_rate": 6.567164179104477e-08, + "loss": 0.0198, + "reward": 0.07366071827709675, + "reward_std": 0.08362098969519138, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.004464285913854837, + "step": 22 + }, + { + "completion_length": 1752.2032165527344, + "epoch": 0.006870286012993802, + "grad_norm": 0.056473713368177414, + "kl": 3.597140312194824e-05, + "learning_rate": 6.865671641791045e-08, + "loss": 0.0284, + "reward": 0.12723214784637094, + "reward_std": 0.12197329569607973, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.006696428870782256, + "step": 23 + }, + { + "completion_length": 1616.08935546875, + "epoch": 0.007168994100515271, + "grad_norm": 0.0587487667798996, + "kl": 3.2454729080200195e-05, + "learning_rate": 7.164179104477612e-08, + "loss": 0.0324, + "reward": 0.2254464440047741, + "reward_std": 0.16586939245462418, + "rewards/accuracy_reward": 0.2209821604192257, + "rewards/format_reward": 0.004464285913854837, + "step": 24 + }, + { + "completion_length": 1769.8148193359375, + "epoch": 0.007467702188036741, + "grad_norm": 0.051810018718242645, + "kl": 3.230571746826172e-05, + "learning_rate": 7.462686567164178e-08, + "loss": 0.0282, + "reward": 0.1116071492433548, + "reward_std": 0.12399674393236637, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "step": 25 + }, + { + "completion_length": 1705.2790832519531, + "epoch": 0.00776641027555821, + "grad_norm": 0.04350828751921654, + "kl": 3.483891487121582e-05, + "learning_rate": 7.761194029850746e-08, + "loss": 0.019, + "reward": 0.11383929196745157, + "reward_std": 0.12160852737724781, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.0, + "step": 26 + }, + { + "completion_length": 1698.8527526855469, + "epoch": 0.00806511836307968, + "grad_norm": 0.06312357634305954, + "kl": 5.257129669189453e-05, + "learning_rate": 8.059701492537313e-08, + "loss": 0.0396, + "reward": 0.2254464386496693, + "reward_std": 0.12023318652063608, + "rewards/accuracy_reward": 0.20982143632136285, + "rewards/format_reward": 0.01562500069849193, + "step": 27 + }, + { + "completion_length": 1674.8282165527344, + "epoch": 0.00836382645060115, + "grad_norm": 0.03965715691447258, + "kl": 3.316998481750488e-05, + "learning_rate": 8.35820895522388e-08, + "loss": 0.0179, + "reward": 0.09598214644938707, + "reward_std": 0.08370935916900635, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.008928571827709675, + "step": 28 + }, + { + "completion_length": 1606.9063110351562, + "epoch": 0.008662534538122619, + "grad_norm": 0.06423179805278778, + "kl": 3.93986701965332e-05, + "learning_rate": 8.656716417910448e-08, + "loss": 0.0436, + "reward": 0.09151786239817739, + "reward_std": 0.14748516026884317, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.022321429569274187, + "step": 29 + }, + { + "completion_length": 1694.8683471679688, + "epoch": 0.00896124262564409, + "grad_norm": 0.05486081540584564, + "kl": 3.5315752029418945e-05, + "learning_rate": 8.955223880597014e-08, + "loss": 0.0255, + "reward": 0.1941964365541935, + "reward_std": 0.15642656851559877, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.004464285913854837, + "step": 30 + }, + { + "completion_length": 1575.9487609863281, + "epoch": 0.009259950713165559, + "grad_norm": 0.05592089146375656, + "kl": 5.942583084106445e-05, + "learning_rate": 9.253731343283581e-08, + "loss": 0.0249, + "reward": 0.08258928917348385, + "reward_std": 0.10515020601451397, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.022321429569274187, + "step": 31 + }, + { + "completion_length": 1702.3304443359375, + "epoch": 0.009558658800687028, + "grad_norm": 0.04740530252456665, + "kl": 4.106760025024414e-05, + "learning_rate": 9.55223880597015e-08, + "loss": 0.013, + "reward": 0.1450892947614193, + "reward_std": 0.10071961395442486, + "rewards/accuracy_reward": 0.1383928693830967, + "rewards/format_reward": 0.0066964291036129, + "step": 32 + }, + { + "completion_length": 1746.7723693847656, + "epoch": 0.009857366888208497, + "grad_norm": 0.04644298925995827, + "kl": 3.775954246520996e-05, + "learning_rate": 9.850746268656717e-08, + "loss": 0.0268, + "reward": 0.09821429150179029, + "reward_std": 0.09249581396579742, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.008928571827709675, + "step": 33 + }, + { + "completion_length": 1706.0112609863281, + "epoch": 0.010156074975729968, + "grad_norm": 0.051091309636831284, + "kl": 3.230571746826172e-05, + "learning_rate": 1.0149253731343282e-07, + "loss": 0.0224, + "reward": 0.129464291036129, + "reward_std": 0.11575703509151936, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.008928571827709675, + "step": 34 + }, + { + "completion_length": 1681.0402526855469, + "epoch": 0.010454783063251438, + "grad_norm": 0.056992363184690475, + "kl": 3.3915042877197266e-05, + "learning_rate": 1.044776119402985e-07, + "loss": 0.0288, + "reward": 0.1272321529686451, + "reward_std": 0.15795506723225117, + "rewards/accuracy_reward": 0.10937500931322575, + "rewards/format_reward": 0.01785714365541935, + "step": 35 + }, + { + "completion_length": 1679.3750610351562, + "epoch": 0.010753491150772907, + "grad_norm": 0.04613526538014412, + "kl": 4.3511390686035156e-05, + "learning_rate": 1.0746268656716417e-07, + "loss": 0.0112, + "reward": 0.06473214668221772, + "reward_std": 0.09195108991116285, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.004464285913854837, + "step": 36 + }, + { + "completion_length": 1676.1027526855469, + "epoch": 0.011052199238294378, + "grad_norm": 0.06323786824941635, + "kl": 3.081560134887695e-05, + "learning_rate": 1.1044776119402985e-07, + "loss": 0.0324, + "reward": 0.1272321455180645, + "reward_std": 0.17131321132183075, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.008928571827709675, + "step": 37 + }, + { + "completion_length": 1728.9576416015625, + "epoch": 0.011350907325815847, + "grad_norm": 0.03247479721903801, + "kl": 4.1425228118896484e-05, + "learning_rate": 1.1343283582089553e-07, + "loss": 0.0137, + "reward": 0.04910714412108064, + "reward_std": 0.04791303817182779, + "rewards/accuracy_reward": 0.044642857974395156, + "rewards/format_reward": 0.004464285913854837, + "step": 38 + }, + { + "completion_length": 1696.5023193359375, + "epoch": 0.011649615413337316, + "grad_norm": 0.06918708235025406, + "kl": 4.607439041137695e-05, + "learning_rate": 1.1641791044776119e-07, + "loss": 0.044, + "reward": 0.08482143469154835, + "reward_std": 0.14885824546217918, + "rewards/accuracy_reward": 0.08482143469154835, + "rewards/format_reward": 0.0, + "step": 39 + }, + { + "completion_length": 1545.1607971191406, + "epoch": 0.011948323500858785, + "grad_norm": 0.06363609433174133, + "kl": 3.400444984436035e-05, + "learning_rate": 1.1940298507462686e-07, + "loss": 0.0265, + "reward": 0.21205358137376606, + "reward_std": 0.16945129726082087, + "rewards/accuracy_reward": 0.19866072316654027, + "rewards/format_reward": 0.013392857741564512, + "step": 40 + }, + { + "completion_length": 1710.0648193359375, + "epoch": 0.012247031588380256, + "grad_norm": 0.04683378338813782, + "kl": 4.124641418457031e-05, + "learning_rate": 1.2238805970149254e-07, + "loss": 0.0301, + "reward": 0.18080357648432255, + "reward_std": 0.13278185948729515, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.008928572060540318, + "step": 41 + }, + { + "completion_length": 1777.6630249023438, + "epoch": 0.012545739675901725, + "grad_norm": 0.04300553724169731, + "kl": 4.273653030395508e-05, + "learning_rate": 1.253731343283582e-07, + "loss": 0.0165, + "reward": 0.07142857671715319, + "reward_std": 0.08060805406421423, + "rewards/accuracy_reward": 0.06473214528523386, + "rewards/format_reward": 0.006696428870782256, + "step": 42 + }, + { + "completion_length": 1735.8438415527344, + "epoch": 0.012844447763423195, + "grad_norm": 0.05505981668829918, + "kl": 3.49581241607666e-05, + "learning_rate": 1.2835820895522386e-07, + "loss": 0.0347, + "reward": 0.12053571850992739, + "reward_std": 0.16318729985505342, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.01562500069849193, + "step": 43 + }, + { + "completion_length": 1669.15185546875, + "epoch": 0.013143155850944664, + "grad_norm": 0.05982421711087227, + "kl": 3.325939178466797e-05, + "learning_rate": 1.3134328358208955e-07, + "loss": 0.0283, + "reward": 0.13839286472648382, + "reward_std": 0.1492392122745514, + "rewards/accuracy_reward": 0.13616072293370962, + "rewards/format_reward": 0.0022321429569274187, + "step": 44 + }, + { + "completion_length": 1714.1228332519531, + "epoch": 0.013441863938466135, + "grad_norm": 0.04766504839062691, + "kl": 3.394484519958496e-05, + "learning_rate": 1.343283582089552e-07, + "loss": 0.0336, + "reward": 0.0959821455180645, + "reward_std": 0.10722821950912476, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.004464285913854837, + "step": 45 + }, + { + "completion_length": 1779.12060546875, + "epoch": 0.013740572025987604, + "grad_norm": 0.050136856734752655, + "kl": 4.035234451293945e-05, + "learning_rate": 1.373134328358209e-07, + "loss": 0.0246, + "reward": 0.15178572200238705, + "reward_std": 0.13453482650220394, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.004464285913854837, + "step": 46 + }, + { + "completion_length": 1678.9688110351562, + "epoch": 0.014039280113509073, + "grad_norm": 0.039608608931303024, + "kl": 3.784894943237305e-05, + "learning_rate": 1.4029850746268658e-07, + "loss": 0.0104, + "reward": 0.12946429220028222, + "reward_std": 0.09296693932265043, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.011160714784637094, + "step": 47 + }, + { + "completion_length": 1801.1786499023438, + "epoch": 0.014337988201030542, + "grad_norm": 0.04509079456329346, + "kl": 4.0590763092041016e-05, + "learning_rate": 1.4328358208955223e-07, + "loss": 0.0114, + "reward": 0.06026785937137902, + "reward_std": 0.07993951719254255, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.008928571827709675, + "step": 48 + }, + { + "completion_length": 1795.68310546875, + "epoch": 0.014636696288552013, + "grad_norm": 0.06266029179096222, + "kl": 3.1828880310058594e-05, + "learning_rate": 1.4626865671641792e-07, + "loss": 0.0364, + "reward": 0.11160715017467737, + "reward_std": 0.14242512360215187, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.013392857974395156, + "step": 49 + }, + { + "completion_length": 1673.6250610351562, + "epoch": 0.014935404376073482, + "grad_norm": 0.04995095729827881, + "kl": 2.5331974029541016e-05, + "learning_rate": 1.4925373134328355e-07, + "loss": 0.0249, + "reward": 0.1227678619325161, + "reward_std": 0.14626564271748066, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.01116071455180645, + "step": 50 + }, + { + "completion_length": 1658.4330749511719, + "epoch": 0.015234112463594952, + "grad_norm": 0.03937812149524689, + "kl": 3.93986701965332e-05, + "learning_rate": 1.5223880597014924e-07, + "loss": 0.0134, + "reward": 0.1785714365541935, + "reward_std": 0.08613727707415819, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.006696428870782256, + "step": 51 + }, + { + "completion_length": 1730.40185546875, + "epoch": 0.01553282055111642, + "grad_norm": 0.04167767986655235, + "kl": 2.5838613510131836e-05, + "learning_rate": 1.5522388059701492e-07, + "loss": 0.0032, + "reward": 0.07589286053553224, + "reward_std": 0.08896646834909916, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0022321429569274187, + "step": 52 + }, + { + "completion_length": 1738.2857666015625, + "epoch": 0.01583152863863789, + "grad_norm": 0.05946704372763634, + "kl": 4.369020462036133e-05, + "learning_rate": 1.5820895522388058e-07, + "loss": 0.0284, + "reward": 0.1584821492433548, + "reward_std": 0.14174646511673927, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.006696428870782256, + "step": 53 + }, + { + "completion_length": 1713.8973693847656, + "epoch": 0.01613023672615936, + "grad_norm": 0.045064933598041534, + "kl": 3.0547380447387695e-05, + "learning_rate": 1.6119402985074627e-07, + "loss": 0.0082, + "reward": 0.13616071827709675, + "reward_std": 0.06878311559557915, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0022321429569274187, + "step": 54 + }, + { + "completion_length": 1771.1964721679688, + "epoch": 0.016428944813680832, + "grad_norm": 0.0394134521484375, + "kl": 3.314018249511719e-05, + "learning_rate": 1.6417910447761193e-07, + "loss": 0.0143, + "reward": 0.05580357322469354, + "reward_std": 0.09047362674027681, + "rewards/accuracy_reward": 0.042410716880112886, + "rewards/format_reward": 0.013392857741564512, + "step": 55 + }, + { + "completion_length": 1707.35498046875, + "epoch": 0.0167276529012023, + "grad_norm": 0.05312688648700714, + "kl": 3.489851951599121e-05, + "learning_rate": 1.671641791044776e-07, + "loss": 0.0307, + "reward": 0.055803573690354824, + "reward_std": 0.11445087008178234, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.013392857974395156, + "step": 56 + }, + { + "completion_length": 1655.8304443359375, + "epoch": 0.01702636098872377, + "grad_norm": 0.04495803639292717, + "kl": 2.2113323211669922e-05, + "learning_rate": 1.701492537313433e-07, + "loss": 0.0252, + "reward": 0.0736607164144516, + "reward_std": 0.1016073040664196, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.015625000465661287, + "step": 57 + }, + { + "completion_length": 1689.0536193847656, + "epoch": 0.017325069076245238, + "grad_norm": 0.07067320495843887, + "kl": 3.3020973205566406e-05, + "learning_rate": 1.7313432835820896e-07, + "loss": 0.0426, + "reward": 0.20089286658912897, + "reward_std": 0.18864312395453453, + "rewards/accuracy_reward": 0.19196428963914514, + "rewards/format_reward": 0.008928571827709675, + "step": 58 + }, + { + "completion_length": 1630.1585388183594, + "epoch": 0.01762377716376671, + "grad_norm": 0.04939524829387665, + "kl": 3.007054328918457e-05, + "learning_rate": 1.7611940298507461e-07, + "loss": 0.0286, + "reward": 0.12276786612346768, + "reward_std": 0.10851243231445551, + "rewards/accuracy_reward": 0.11160714970901608, + "rewards/format_reward": 0.01116071455180645, + "step": 59 + }, + { + "completion_length": 1770.3772888183594, + "epoch": 0.01792248525128818, + "grad_norm": 0.054393164813518524, + "kl": 2.962350845336914e-05, + "learning_rate": 1.7910447761194027e-07, + "loss": 0.0342, + "reward": 0.1361607238650322, + "reward_std": 0.1515438873320818, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.015625000931322575, + "step": 60 + }, + { + "completion_length": 1685.3348693847656, + "epoch": 0.018221193338809647, + "grad_norm": 0.05150194838643074, + "kl": 3.695487976074219e-05, + "learning_rate": 1.8208955223880596e-07, + "loss": 0.0243, + "reward": 0.12053571827709675, + "reward_std": 0.127948347479105, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.017857144121080637, + "step": 61 + }, + { + "completion_length": 1563.8036499023438, + "epoch": 0.018519901426331118, + "grad_norm": 0.05364827439188957, + "kl": 2.586841583251953e-05, + "learning_rate": 1.8507462686567162e-07, + "loss": 0.0223, + "reward": 0.06026786100119352, + "reward_std": 0.10426824167370796, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.004464285913854837, + "step": 62 + }, + { + "completion_length": 1657.9286193847656, + "epoch": 0.01881860951385259, + "grad_norm": 0.049354057759046555, + "kl": 2.22623348236084e-05, + "learning_rate": 1.880597014925373e-07, + "loss": 0.0167, + "reward": 0.053571431897580624, + "reward_std": 0.1256946064531803, + "rewards/accuracy_reward": 0.04017857229337096, + "rewards/format_reward": 0.013392857741564512, + "step": 63 + }, + { + "completion_length": 1552.8661499023438, + "epoch": 0.019117317601374056, + "grad_norm": 0.057510435581207275, + "kl": 3.17990779876709e-05, + "learning_rate": 1.91044776119403e-07, + "loss": 0.02, + "reward": 0.2053571492433548, + "reward_std": 0.14952366519719362, + "rewards/accuracy_reward": 0.191964291036129, + "rewards/format_reward": 0.0133928582072258, + "step": 64 + }, + { + "completion_length": 1759.0558776855469, + "epoch": 0.019416025688895527, + "grad_norm": 0.06367983669042587, + "kl": 2.714991569519043e-05, + "learning_rate": 1.9402985074626865e-07, + "loss": 0.0421, + "reward": 0.07589286053553224, + "reward_std": 0.13627580367028713, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.013392857974395156, + "step": 65 + }, + { + "completion_length": 1822.5826721191406, + "epoch": 0.019714733776416995, + "grad_norm": 0.036766167730093, + "kl": 2.917647361755371e-05, + "learning_rate": 1.9701492537313433e-07, + "loss": 0.0122, + "reward": 0.09151786146685481, + "reward_std": 0.09155263099819422, + "rewards/accuracy_reward": 0.09151786146685481, + "rewards/format_reward": 0.0, + "step": 66 + }, + { + "completion_length": 1820.2835388183594, + "epoch": 0.020013441863938466, + "grad_norm": 0.039648186415433884, + "kl": 1.6629695892333984e-05, + "learning_rate": 2e-07, + "loss": 0.0186, + "reward": 0.10267857555299997, + "reward_std": 0.12379970215260983, + "rewards/accuracy_reward": 0.09821429010480642, + "rewards/format_reward": 0.004464285913854837, + "step": 67 + }, + { + "completion_length": 1594.7322082519531, + "epoch": 0.020312149951459937, + "grad_norm": 0.0487375482916832, + "kl": 1.582503318786621e-05, + "learning_rate": 2.0298507462686565e-07, + "loss": 0.0084, + "reward": 0.1584821529686451, + "reward_std": 0.10763046331703663, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.0, + "step": 68 + }, + { + "completion_length": 1668.8572082519531, + "epoch": 0.020610858038981404, + "grad_norm": 0.04292088374495506, + "kl": 2.9206275939941406e-05, + "learning_rate": 2.0597014925373134e-07, + "loss": 0.0122, + "reward": 0.08258928591385484, + "reward_std": 0.10053416062146425, + "rewards/accuracy_reward": 0.07812500046566129, + "rewards/format_reward": 0.004464285913854837, + "step": 69 + }, + { + "completion_length": 1677.4955749511719, + "epoch": 0.020909566126502875, + "grad_norm": 0.07468431442975998, + "kl": 1.7642974853515625e-05, + "learning_rate": 2.08955223880597e-07, + "loss": 0.045, + "reward": 0.13839286379516125, + "reward_std": 0.20624065399169922, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.006696428870782256, + "step": 70 + }, + { + "completion_length": 1558.6518249511719, + "epoch": 0.021208274214024346, + "grad_norm": 0.07890792936086655, + "kl": 1.73225998878479e-05, + "learning_rate": 2.1194029850746268e-07, + "loss": 0.0475, + "reward": 0.1919642947614193, + "reward_std": 0.20236171782016754, + "rewards/accuracy_reward": 0.18526786752045155, + "rewards/format_reward": 0.006696428870782256, + "step": 71 + }, + { + "completion_length": 1587.8415832519531, + "epoch": 0.021506982301545814, + "grad_norm": 0.042974092066287994, + "kl": 1.9624829292297363e-05, + "learning_rate": 2.1492537313432834e-07, + "loss": 0.0229, + "reward": 0.13839286286383867, + "reward_std": 0.08748415019363165, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.006696428870782256, + "step": 72 + }, + { + "completion_length": 1570.07373046875, + "epoch": 0.021805690389067284, + "grad_norm": 0.06709501147270203, + "kl": 2.3797154426574707e-05, + "learning_rate": 2.1791044776119402e-07, + "loss": 0.0228, + "reward": 0.2433035895228386, + "reward_std": 0.17911841347813606, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.022321430034935474, + "step": 73 + }, + { + "completion_length": 1661.41748046875, + "epoch": 0.022104398476588755, + "grad_norm": 0.050798214972019196, + "kl": 1.553446054458618e-05, + "learning_rate": 2.208955223880597e-07, + "loss": 0.0237, + "reward": 0.05133928847499192, + "reward_std": 0.10677819419652224, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.006696428870782256, + "step": 74 + }, + { + "completion_length": 1655.4866638183594, + "epoch": 0.022403106564110223, + "grad_norm": 0.052087631076574326, + "kl": 1.671910285949707e-05, + "learning_rate": 2.2388059701492537e-07, + "loss": 0.0178, + "reward": 0.09821429220028222, + "reward_std": 0.11808110307902098, + "rewards/accuracy_reward": 0.08928572130389512, + "rewards/format_reward": 0.008928572060540318, + "step": 75 + }, + { + "completion_length": 1663.7611999511719, + "epoch": 0.022701814651631694, + "grad_norm": 0.055990323424339294, + "kl": 1.858919858932495e-05, + "learning_rate": 2.2686567164179105e-07, + "loss": 0.006, + "reward": 0.08705357694998384, + "reward_std": 0.09626706037670374, + "rewards/accuracy_reward": 0.08035714598372579, + "rewards/format_reward": 0.006696428870782256, + "step": 76 + }, + { + "completion_length": 1712.1429443359375, + "epoch": 0.02300052273915316, + "grad_norm": 0.04551730677485466, + "kl": 1.689046621322632e-05, + "learning_rate": 2.2985074626865669e-07, + "loss": 0.0268, + "reward": 0.1071428619325161, + "reward_std": 0.08477053046226501, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.004464285913854837, + "step": 77 + }, + { + "completion_length": 1826.6719665527344, + "epoch": 0.023299230826674632, + "grad_norm": 0.040820423513650894, + "kl": 1.3976008631289005e-05, + "learning_rate": 2.3283582089552237e-07, + "loss": 0.0201, + "reward": 0.14955358067527413, + "reward_std": 0.09650375600904226, + "rewards/accuracy_reward": 0.14285715413279831, + "rewards/format_reward": 0.006696428870782256, + "step": 78 + }, + { + "completion_length": 1841.41748046875, + "epoch": 0.023597938914196103, + "grad_norm": 0.023537449538707733, + "kl": 1.3768672943115234e-05, + "learning_rate": 2.3582089552238803e-07, + "loss": 0.0063, + "reward": 0.058035716181620955, + "reward_std": 0.04508384224027395, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0022321429569274187, + "step": 79 + }, + { + "completion_length": 1645.3237609863281, + "epoch": 0.02389664700171757, + "grad_norm": 0.05437898635864258, + "kl": 1.0658055543899536e-05, + "learning_rate": 2.388059701492537e-07, + "loss": 0.027, + "reward": 0.1026785746216774, + "reward_std": 0.11483267601579428, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.004464285913854837, + "step": 80 + }, + { + "completion_length": 1723.7746276855469, + "epoch": 0.02419535508923904, + "grad_norm": 0.04588991403579712, + "kl": 1.1682510375976562e-05, + "learning_rate": 2.417910447761194e-07, + "loss": 0.0164, + "reward": 0.1316964328289032, + "reward_std": 0.09871610999107361, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.020089286379516125, + "step": 81 + }, + { + "completion_length": 1733.1942749023438, + "epoch": 0.024494063176760512, + "grad_norm": 0.051920246332883835, + "kl": 1.8164515495300293e-05, + "learning_rate": 2.447761194029851e-07, + "loss": 0.0276, + "reward": 0.07366071688011289, + "reward_std": 0.12923650722950697, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.004464285913854837, + "step": 82 + }, + { + "completion_length": 1640.77685546875, + "epoch": 0.02479277126428198, + "grad_norm": 0.04617319628596306, + "kl": 1.7255544662475586e-05, + "learning_rate": 2.4776119402985074e-07, + "loss": 0.0134, + "reward": 0.05133928847499192, + "reward_std": 0.09008043352514505, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.008928571827709675, + "step": 83 + }, + { + "completion_length": 1645.4486999511719, + "epoch": 0.02509147935180345, + "grad_norm": 0.07751814275979996, + "kl": 1.6011297702789307e-05, + "learning_rate": 2.507462686567164e-07, + "loss": 0.0371, + "reward": 0.12723214738070965, + "reward_std": 0.171880841255188, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0133928582072258, + "step": 84 + }, + { + "completion_length": 1716.6139221191406, + "epoch": 0.02539018743932492, + "grad_norm": 0.054927077144384384, + "kl": 1.0408461093902588e-05, + "learning_rate": 2.537313432835821e-07, + "loss": 0.0315, + "reward": 0.20089286682195961, + "reward_std": 0.14652621280401945, + "rewards/accuracy_reward": 0.19419643399305642, + "rewards/format_reward": 0.006696428870782256, + "step": 85 + }, + { + "completion_length": 1744.4866943359375, + "epoch": 0.02568889552684639, + "grad_norm": 0.04859442636370659, + "kl": 2.1159648895263672e-05, + "learning_rate": 2.567164179104477e-07, + "loss": 0.016, + "reward": 0.1250000074505806, + "reward_std": 0.11255832016468048, + "rewards/accuracy_reward": 0.12053572200238705, + "rewards/format_reward": 0.004464285913854837, + "step": 86 + }, + { + "completion_length": 1653.9933776855469, + "epoch": 0.02598760361436786, + "grad_norm": 0.05586105212569237, + "kl": 2.2143125534057617e-05, + "learning_rate": 2.5970149253731343e-07, + "loss": 0.0244, + "reward": 0.20312501210719347, + "reward_std": 0.1552404835820198, + "rewards/accuracy_reward": 0.19419643748551607, + "rewards/format_reward": 0.008928571827709675, + "step": 87 + }, + { + "completion_length": 1788.7098999023438, + "epoch": 0.026286311701889328, + "grad_norm": 0.04623936861753464, + "kl": 3.3408403396606445e-05, + "learning_rate": 2.626865671641791e-07, + "loss": 0.0184, + "reward": 0.0357142873108387, + "reward_std": 0.10003227926790714, + "rewards/accuracy_reward": 0.029017858440056443, + "rewards/format_reward": 0.006696428870782256, + "step": 88 + }, + { + "completion_length": 1658.8750915527344, + "epoch": 0.0265850197894108, + "grad_norm": 0.057209454476833344, + "kl": 3.549456596374512e-05, + "learning_rate": 2.656716417910448e-07, + "loss": 0.0314, + "reward": 0.2187500111758709, + "reward_std": 0.11525190807878971, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.0022321429569274187, + "step": 89 + }, + { + "completion_length": 1782.3683776855469, + "epoch": 0.02688372787693227, + "grad_norm": 0.04070289433002472, + "kl": 5.161762237548828e-05, + "learning_rate": 2.686567164179104e-07, + "loss": 0.0202, + "reward": 0.0714285746216774, + "reward_std": 0.07644847221672535, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.006696428870782256, + "step": 90 + }, + { + "completion_length": 1604.9911499023438, + "epoch": 0.027182435964453737, + "grad_norm": 0.062121082097291946, + "kl": 5.060434341430664e-05, + "learning_rate": 2.7164179104477607e-07, + "loss": 0.0235, + "reward": 0.06696428963914514, + "reward_std": 0.1317344531416893, + "rewards/accuracy_reward": 0.04017857299186289, + "rewards/format_reward": 0.026785715483129025, + "step": 91 + }, + { + "completion_length": 1579.0826721191406, + "epoch": 0.027481144051975208, + "grad_norm": 0.050243206322193146, + "kl": 5.537271499633789e-05, + "learning_rate": 2.746268656716418e-07, + "loss": 0.0232, + "reward": 0.12276786286383867, + "reward_std": 0.12700968142598867, + "rewards/accuracy_reward": 0.11160714412108064, + "rewards/format_reward": 0.011160714784637094, + "step": 92 + }, + { + "completion_length": 1636.8795776367188, + "epoch": 0.027779852139496675, + "grad_norm": 0.06345855444669724, + "kl": 5.692243576049805e-05, + "learning_rate": 2.7761194029850744e-07, + "loss": 0.0266, + "reward": 0.2165178582072258, + "reward_std": 0.20447038114070892, + "rewards/accuracy_reward": 0.2142857164144516, + "rewards/format_reward": 0.0022321429569274187, + "step": 93 + }, + { + "completion_length": 1752.94873046875, + "epoch": 0.028078560227018146, + "grad_norm": 0.04778573662042618, + "kl": 4.7206878662109375e-05, + "learning_rate": 2.8059701492537315e-07, + "loss": 0.0199, + "reward": 0.0691964328289032, + "reward_std": 0.12900490500032902, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0022321429569274187, + "step": 94 + }, + { + "completion_length": 1762.85498046875, + "epoch": 0.028377268314539617, + "grad_norm": 0.05785205215215683, + "kl": 6.777048110961914e-05, + "learning_rate": 2.8358208955223876e-07, + "loss": 0.0301, + "reward": 0.098214291036129, + "reward_std": 0.0972639862447977, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.01562500116415322, + "step": 95 + }, + { + "completion_length": 1696.8907165527344, + "epoch": 0.028675976402061085, + "grad_norm": 0.06844736635684967, + "kl": 7.015466690063477e-05, + "learning_rate": 2.8656716417910447e-07, + "loss": 0.027, + "reward": 0.12500000605359674, + "reward_std": 0.1552263293415308, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.024553572526201606, + "step": 96 + }, + { + "completion_length": 1811.9688720703125, + "epoch": 0.028974684489582556, + "grad_norm": 0.03674961254000664, + "kl": 7.522106170654297e-05, + "learning_rate": 2.8955223880597013e-07, + "loss": 0.0065, + "reward": 0.04241071571595967, + "reward_std": 0.08657825831323862, + "rewards/accuracy_reward": 0.04017857392318547, + "rewards/format_reward": 0.0022321429569274187, + "step": 97 + }, + { + "completion_length": 1759.6295776367188, + "epoch": 0.029273392577104027, + "grad_norm": 0.06964971125125885, + "kl": 9.846687316894531e-05, + "learning_rate": 2.9253731343283584e-07, + "loss": 0.0372, + "reward": 0.22991071874275804, + "reward_std": 0.14032442588359118, + "rewards/accuracy_reward": 0.22321429336443543, + "rewards/format_reward": 0.006696428870782256, + "step": 98 + }, + { + "completion_length": 1558.1808471679688, + "epoch": 0.029572100664625494, + "grad_norm": 0.04793652519583702, + "kl": 0.00011479854583740234, + "learning_rate": 2.955223880597015e-07, + "loss": 0.0199, + "reward": 0.0848214328289032, + "reward_std": 0.12375291809439659, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.013392857974395156, + "step": 99 + }, + { + "completion_length": 1729.2835693359375, + "epoch": 0.029870808752146965, + "grad_norm": 0.06034554913640022, + "kl": 0.00012183189392089844, + "learning_rate": 2.985074626865671e-07, + "loss": 0.0168, + "reward": 0.11607143329456449, + "reward_std": 0.15944681502878666, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.004464285913854837, + "step": 100 + }, + { + "completion_length": 1681.5736999511719, + "epoch": 0.030169516839668432, + "grad_norm": 0.057237885892391205, + "kl": 8.004903793334961e-05, + "learning_rate": 3.014925373134328e-07, + "loss": 0.0372, + "reward": 0.15178571757860482, + "reward_std": 0.1496741333976388, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.004464285913854837, + "step": 101 + }, + { + "completion_length": 1744.8326721191406, + "epoch": 0.030468224927189903, + "grad_norm": 0.04614361375570297, + "kl": 0.00010061264038085938, + "learning_rate": 3.044776119402985e-07, + "loss": 0.0256, + "reward": 0.10714286472648382, + "reward_std": 0.09437388740479946, + "rewards/accuracy_reward": 0.08928571711294353, + "rewards/format_reward": 0.017857144121080637, + "step": 102 + }, + { + "completion_length": 1629.1830749511719, + "epoch": 0.030766933014711374, + "grad_norm": 0.07557614892721176, + "kl": 0.0001558065414428711, + "learning_rate": 3.074626865671642e-07, + "loss": 0.0424, + "reward": 0.1696428619325161, + "reward_std": 0.1629813089966774, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.008928572060540318, + "step": 103 + }, + { + "completion_length": 1619.0670166015625, + "epoch": 0.03106564110223284, + "grad_norm": 0.07275954633951187, + "kl": 0.00013840198516845703, + "learning_rate": 3.1044776119402985e-07, + "loss": 0.0318, + "reward": 0.18080358020961285, + "reward_std": 0.17114954814314842, + "rewards/accuracy_reward": 0.16964286053553224, + "rewards/format_reward": 0.011160715017467737, + "step": 104 + }, + { + "completion_length": 1843.3996276855469, + "epoch": 0.03136434918975431, + "grad_norm": 0.03301394730806351, + "kl": 0.00014710426330566406, + "learning_rate": 3.134328358208955e-07, + "loss": 0.0102, + "reward": 0.1116071492433548, + "reward_std": 0.07940683979541063, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "step": 105 + }, + { + "completion_length": 1600.6719360351562, + "epoch": 0.03166305727727578, + "grad_norm": 0.07341071218252182, + "kl": 0.0001430511474609375, + "learning_rate": 3.1641791044776116e-07, + "loss": 0.054, + "reward": 0.13392857648432255, + "reward_std": 0.180702842772007, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.011160714784637094, + "step": 106 + }, + { + "completion_length": 1672.7210388183594, + "epoch": 0.03196176536479725, + "grad_norm": 0.05614521726965904, + "kl": 0.00018262863159179688, + "learning_rate": 3.194029850746269e-07, + "loss": 0.0204, + "reward": 0.08482143399305642, + "reward_std": 0.10669602360576391, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.01562500116415322, + "step": 107 + }, + { + "completion_length": 1751.6384582519531, + "epoch": 0.03226047345231872, + "grad_norm": 0.05019238591194153, + "kl": 0.00018835067749023438, + "learning_rate": 3.2238805970149253e-07, + "loss": 0.0317, + "reward": 0.08482143422588706, + "reward_std": 0.12819715775549412, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.015625000465661287, + "step": 108 + }, + { + "completion_length": 1717.919677734375, + "epoch": 0.03255918153984019, + "grad_norm": 0.06629742681980133, + "kl": 0.00022530555725097656, + "learning_rate": 3.253731343283582e-07, + "loss": 0.0354, + "reward": 0.18303572619333863, + "reward_std": 0.15841143764555454, + "rewards/accuracy_reward": 0.17633929592557251, + "rewards/format_reward": 0.006696428870782256, + "step": 109 + }, + { + "completion_length": 1650.8951721191406, + "epoch": 0.032857889627361664, + "grad_norm": 0.051004454493522644, + "kl": 0.0003151893615722656, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.0218, + "reward": 0.05357143119908869, + "reward_std": 0.1040569031611085, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.006696428870782256, + "step": 110 + }, + { + "completion_length": 1764.7232666015625, + "epoch": 0.03315659771488313, + "grad_norm": 0.04392259940505028, + "kl": 0.00029277801513671875, + "learning_rate": 3.313432835820895e-07, + "loss": 0.0143, + "reward": 0.06473214388825, + "reward_std": 0.10251108650118113, + "rewards/accuracy_reward": 0.05580357206054032, + "rewards/format_reward": 0.008928572060540318, + "step": 111 + }, + { + "completion_length": 1730.7366943359375, + "epoch": 0.0334553058024046, + "grad_norm": 0.0540962778031826, + "kl": 0.00045299530029296875, + "learning_rate": 3.343283582089552e-07, + "loss": 0.0279, + "reward": 0.08482143213041127, + "reward_std": 0.11304582748562098, + "rewards/accuracy_reward": 0.06473214528523386, + "rewards/format_reward": 0.02008928661234677, + "step": 112 + }, + { + "completion_length": 1721.5536499023438, + "epoch": 0.03375401388992607, + "grad_norm": 0.0517786480486393, + "kl": 0.00042629241943359375, + "learning_rate": 3.373134328358209e-07, + "loss": 0.0077, + "reward": 0.1517857201397419, + "reward_std": 0.09870189148932695, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.008928572060540318, + "step": 113 + }, + { + "completion_length": 1609.4420166015625, + "epoch": 0.03405272197744754, + "grad_norm": 0.05495617538690567, + "kl": 0.0003972053527832031, + "learning_rate": 3.402985074626866e-07, + "loss": 0.0386, + "reward": 0.1517857201397419, + "reward_std": 0.11822676379233599, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.029017858440056443, + "step": 114 + }, + { + "completion_length": 1874.305908203125, + "epoch": 0.03435143006496901, + "grad_norm": 0.036008477210998535, + "kl": 0.00045871734619140625, + "learning_rate": 3.432835820895522e-07, + "loss": 0.0127, + "reward": 0.06696428684517741, + "reward_std": 0.07666933722794056, + "rewards/accuracy_reward": 0.040178571827709675, + "rewards/format_reward": 0.026785715483129025, + "step": 115 + }, + { + "completion_length": 1690.3348999023438, + "epoch": 0.034650138152490476, + "grad_norm": 0.04117932915687561, + "kl": 0.0005507469177246094, + "learning_rate": 3.462686567164179e-07, + "loss": 0.0064, + "reward": 0.037946428870782256, + "reward_std": 0.06308563705533743, + "rewards/accuracy_reward": 0.035714287078008056, + "rewards/format_reward": 0.0022321429569274187, + "step": 116 + }, + { + "completion_length": 1766.2344665527344, + "epoch": 0.034948846240011947, + "grad_norm": 0.04783080518245697, + "kl": 0.0005679130554199219, + "learning_rate": 3.4925373134328357e-07, + "loss": 0.01, + "reward": 0.09821429196745157, + "reward_std": 0.05903974920511246, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.006696428870782256, + "step": 117 + }, + { + "completion_length": 1673.3929138183594, + "epoch": 0.03524755432753342, + "grad_norm": 0.04764517769217491, + "kl": 0.00067138671875, + "learning_rate": 3.5223880597014923e-07, + "loss": 0.0232, + "reward": 0.2053571529686451, + "reward_std": 0.13444587402045727, + "rewards/accuracy_reward": 0.191964291036129, + "rewards/format_reward": 0.013392857741564512, + "step": 118 + }, + { + "completion_length": 1593.4933776855469, + "epoch": 0.03554626241505489, + "grad_norm": 0.061403609812259674, + "kl": 0.0006036758422851562, + "learning_rate": 3.552238805970149e-07, + "loss": 0.0289, + "reward": 0.14955357927829027, + "reward_std": 0.1719045452773571, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.011160714784637094, + "step": 119 + }, + { + "completion_length": 1790.5491943359375, + "epoch": 0.03584497050257636, + "grad_norm": 0.047276489436626434, + "kl": 0.0008068084716796875, + "learning_rate": 3.5820895522388055e-07, + "loss": 0.0192, + "reward": 0.08705357578583062, + "reward_std": 0.0752843888476491, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.013392857974395156, + "step": 120 + }, + { + "completion_length": 1575.5915832519531, + "epoch": 0.03614367859009783, + "grad_norm": 0.059127043932676315, + "kl": 0.0008525848388671875, + "learning_rate": 3.6119402985074626e-07, + "loss": 0.0117, + "reward": 0.10937500558793545, + "reward_std": 0.14155262149870396, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.011160715017467737, + "step": 121 + }, + { + "completion_length": 1659.6072387695312, + "epoch": 0.036442386677619294, + "grad_norm": 0.06098371744155884, + "kl": 0.0007610321044921875, + "learning_rate": 3.641791044776119e-07, + "loss": 0.04, + "reward": 0.07589286006987095, + "reward_std": 0.14114661514759064, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.008928571827709675, + "step": 122 + }, + { + "completion_length": 1660.43310546875, + "epoch": 0.036741094765140765, + "grad_norm": 0.04086422547698021, + "kl": 0.0007848739624023438, + "learning_rate": 3.6716417910447763e-07, + "loss": 0.0034, + "reward": 0.07812500465661287, + "reward_std": 0.08759764954447746, + "rewards/accuracy_reward": 0.07142857694998384, + "rewards/format_reward": 0.006696428870782256, + "step": 123 + }, + { + "completion_length": 1667.7634582519531, + "epoch": 0.037039802852662236, + "grad_norm": 0.05997523292899132, + "kl": 0.0010356903076171875, + "learning_rate": 3.7014925373134323e-07, + "loss": 0.0175, + "reward": 0.13169643469154835, + "reward_std": 0.145840622484684, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.008928572060540318, + "step": 124 + }, + { + "completion_length": 1606.571533203125, + "epoch": 0.03733851094018371, + "grad_norm": 0.05838323011994362, + "kl": 0.0009260177612304688, + "learning_rate": 3.7313432835820895e-07, + "loss": 0.0441, + "reward": 0.1919642984867096, + "reward_std": 0.14312764443457127, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.03571428661234677, + "step": 125 + }, + { + "completion_length": 1803.6139221191406, + "epoch": 0.03763721902770518, + "grad_norm": 0.06281829625368118, + "kl": 0.0011396408081054688, + "learning_rate": 3.761194029850746e-07, + "loss": 0.0366, + "reward": 0.212053582072258, + "reward_std": 0.12638495676219463, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.01785714295692742, + "step": 126 + }, + { + "completion_length": 1721.0782165527344, + "epoch": 0.03793592711522664, + "grad_norm": 0.057064976543188095, + "kl": 0.001148223876953125, + "learning_rate": 3.7910447761194026e-07, + "loss": 0.0227, + "reward": 0.18750000302679837, + "reward_std": 0.13548822421580553, + "rewards/accuracy_reward": 0.18080357764847577, + "rewards/format_reward": 0.006696428870782256, + "step": 127 + }, + { + "completion_length": 1618.2255249023438, + "epoch": 0.03823463520274811, + "grad_norm": 0.10370180755853653, + "kl": 0.0014028549194335938, + "learning_rate": 3.82089552238806e-07, + "loss": 0.0323, + "reward": 0.1428571529686451, + "reward_std": 0.14181423373520374, + "rewards/accuracy_reward": 0.10491072246804833, + "rewards/format_reward": 0.03794642933644354, + "step": 128 + }, + { + "completion_length": 1633.80810546875, + "epoch": 0.038533343290269584, + "grad_norm": 0.06281735748052597, + "kl": 0.001125335693359375, + "learning_rate": 3.850746268656716e-07, + "loss": 0.0322, + "reward": 0.2276785857975483, + "reward_std": 0.16544573940336704, + "rewards/accuracy_reward": 0.207589291036129, + "rewards/format_reward": 0.020089287078008056, + "step": 129 + }, + { + "completion_length": 1460.5603332519531, + "epoch": 0.038832051377791055, + "grad_norm": 0.07656513899564743, + "kl": 0.0015106201171875, + "learning_rate": 3.880597014925373e-07, + "loss": 0.0339, + "reward": 0.25000000931322575, + "reward_std": 0.23651868477463722, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.03571428777649999, + "step": 130 + }, + { + "completion_length": 1786.5647888183594, + "epoch": 0.039130759465312526, + "grad_norm": 0.06158895045518875, + "kl": 0.0013751983642578125, + "learning_rate": 3.9104477611940295e-07, + "loss": 0.0468, + "reward": 0.13169643469154835, + "reward_std": 0.18993025086820126, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.017857143888249993, + "step": 131 + }, + { + "completion_length": 1629.4933776855469, + "epoch": 0.03942946755283399, + "grad_norm": 0.07316899299621582, + "kl": 0.0012197494506835938, + "learning_rate": 3.9402985074626866e-07, + "loss": 0.0255, + "reward": 0.07142857555299997, + "reward_std": 0.17215274646878242, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.01785714365541935, + "step": 132 + }, + { + "completion_length": 1573.8750915527344, + "epoch": 0.03972817564035546, + "grad_norm": 0.06987937539815903, + "kl": 0.0014934539794921875, + "learning_rate": 3.970149253731343e-07, + "loss": 0.0488, + "reward": 0.2232142984867096, + "reward_std": 0.1739894226193428, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.04241071571595967, + "step": 133 + }, + { + "completion_length": 1596.82373046875, + "epoch": 0.04002688372787693, + "grad_norm": 0.05614149942994118, + "kl": 0.0015659332275390625, + "learning_rate": 4e-07, + "loss": 0.0232, + "reward": 0.12946429150179029, + "reward_std": 0.10380434710532427, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.042410716880112886, + "step": 134 + }, + { + "completion_length": 1665.0670471191406, + "epoch": 0.0403255918153984, + "grad_norm": 0.0757107362151146, + "kl": 0.001758575439453125, + "learning_rate": 4.0298507462686564e-07, + "loss": 0.0485, + "reward": 0.14732143376022577, + "reward_std": 0.15655561909079552, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.03571428754366934, + "step": 135 + }, + { + "completion_length": 1837.9197692871094, + "epoch": 0.04062429990291987, + "grad_norm": 0.050848670303821564, + "kl": 0.0018596649169921875, + "learning_rate": 4.059701492537313e-07, + "loss": 0.0191, + "reward": 0.06250000302679837, + "reward_std": 0.11217351537197828, + "rewards/accuracy_reward": 0.04910714388824999, + "rewards/format_reward": 0.013392857974395156, + "step": 136 + }, + { + "completion_length": 1732.5625915527344, + "epoch": 0.040923007990441344, + "grad_norm": 0.06324513256549835, + "kl": 0.00213623046875, + "learning_rate": 4.08955223880597e-07, + "loss": 0.0402, + "reward": 0.1227678619325161, + "reward_std": 0.16633044555783272, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.02232142980210483, + "step": 137 + }, + { + "completion_length": 1720.63623046875, + "epoch": 0.04122171607796281, + "grad_norm": 0.08746451884508133, + "kl": 0.001811981201171875, + "learning_rate": 4.1194029850746267e-07, + "loss": 0.057, + "reward": 0.12500000931322575, + "reward_std": 0.16908329725265503, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.026785715017467737, + "step": 138 + }, + { + "completion_length": 1674.6451416015625, + "epoch": 0.04152042416548428, + "grad_norm": 0.057066429406404495, + "kl": 0.0021648406982421875, + "learning_rate": 4.1492537313432833e-07, + "loss": 0.0189, + "reward": 0.267857164144516, + "reward_std": 0.12211152072995901, + "rewards/accuracy_reward": 0.2566964440047741, + "rewards/format_reward": 0.011160715017467737, + "step": 139 + }, + { + "completion_length": 1606.1496276855469, + "epoch": 0.04181913225300575, + "grad_norm": 0.08377691358327866, + "kl": 0.0017337799072265625, + "learning_rate": 4.17910447761194e-07, + "loss": 0.0476, + "reward": 0.1696428693830967, + "reward_std": 0.19661114364862442, + "rewards/accuracy_reward": 0.1272321529686451, + "rewards/format_reward": 0.042410717345774174, + "step": 140 + }, + { + "completion_length": 1690.0826721191406, + "epoch": 0.04211784034052722, + "grad_norm": 0.07963845133781433, + "kl": 0.002140045166015625, + "learning_rate": 4.208955223880597e-07, + "loss": 0.0413, + "reward": 0.1852678656578064, + "reward_std": 0.24697409942746162, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.049107146449387074, + "step": 141 + }, + { + "completion_length": 1512.8304138183594, + "epoch": 0.04241654842804869, + "grad_norm": 0.10564562678337097, + "kl": 0.00241851806640625, + "learning_rate": 4.2388059701492536e-07, + "loss": 0.0741, + "reward": 0.14955357578583062, + "reward_std": 0.21970851067453623, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.044642857974395156, + "step": 142 + }, + { + "completion_length": 1654.6072082519531, + "epoch": 0.042715256515570156, + "grad_norm": 0.07331626862287521, + "kl": 0.00269317626953125, + "learning_rate": 4.2686567164179107e-07, + "loss": 0.0403, + "reward": 0.12500000279396772, + "reward_std": 0.13788001984357834, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.031250001629814506, + "step": 143 + }, + { + "completion_length": 1718.0558776855469, + "epoch": 0.04301396460309163, + "grad_norm": 0.06172342225909233, + "kl": 0.002475738525390625, + "learning_rate": 4.298507462686567e-07, + "loss": 0.0249, + "reward": 0.08928571688011289, + "reward_std": 0.12176966853439808, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.03125000139698386, + "step": 144 + }, + { + "completion_length": 1508.8036499023438, + "epoch": 0.0433126726906131, + "grad_norm": 0.1012967899441719, + "kl": 0.00228118896484375, + "learning_rate": 4.3283582089552234e-07, + "loss": 0.0609, + "reward": 0.1629464328289032, + "reward_std": 0.22505563497543335, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.04910714481957257, + "step": 145 + }, + { + "completion_length": 1704.1719360351562, + "epoch": 0.04361138077813457, + "grad_norm": 0.07013235986232758, + "kl": 0.002712249755859375, + "learning_rate": 4.3582089552238805e-07, + "loss": 0.0168, + "reward": 0.09151786100119352, + "reward_std": 0.1398733425885439, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.033482144586741924, + "step": 146 + }, + { + "completion_length": 1633.7835388183594, + "epoch": 0.04391008886565604, + "grad_norm": 0.10275188833475113, + "kl": 0.003696441650390625, + "learning_rate": 4.388059701492537e-07, + "loss": 0.0467, + "reward": 0.2544643022119999, + "reward_std": 0.2288096584379673, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.03794642956927419, + "step": 147 + }, + { + "completion_length": 1684.0223999023438, + "epoch": 0.04420879695317751, + "grad_norm": 0.06257651001214981, + "kl": 0.003589630126953125, + "learning_rate": 4.417910447761194e-07, + "loss": 0.0361, + "reward": 0.14955358067527413, + "reward_std": 0.15166958328336477, + "rewards/accuracy_reward": 0.11160715157166123, + "rewards/format_reward": 0.037946430034935474, + "step": 148 + }, + { + "completion_length": 1618.7679443359375, + "epoch": 0.044507505040698975, + "grad_norm": 0.07572974264621735, + "kl": 0.0034637451171875, + "learning_rate": 4.44776119402985e-07, + "loss": 0.0242, + "reward": 0.1696428582072258, + "reward_std": 0.23315082117915154, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.05580357392318547, + "step": 149 + }, + { + "completion_length": 1807.8014221191406, + "epoch": 0.044806213128220446, + "grad_norm": 0.05359713360667229, + "kl": 0.002826690673828125, + "learning_rate": 4.4776119402985074e-07, + "loss": 0.025, + "reward": 0.11607143562287092, + "reward_std": 0.16258684918284416, + "rewards/accuracy_reward": 0.10937500651925802, + "rewards/format_reward": 0.006696428870782256, + "step": 150 + }, + { + "completion_length": 1721.6942443847656, + "epoch": 0.04510492121574192, + "grad_norm": 0.09048471599817276, + "kl": 0.0038604736328125, + "learning_rate": 4.507462686567164e-07, + "loss": 0.0564, + "reward": 0.07812500605359674, + "reward_std": 0.1421135338023305, + "rewards/accuracy_reward": 0.03125000186264515, + "rewards/format_reward": 0.04687500325962901, + "step": 151 + }, + { + "completion_length": 1776.8036499023438, + "epoch": 0.04540362930326339, + "grad_norm": 0.0687505453824997, + "kl": 0.003997802734375, + "learning_rate": 4.537313432835821e-07, + "loss": 0.0293, + "reward": 0.18080358067527413, + "reward_std": 0.17267218325287104, + "rewards/accuracy_reward": 0.14955357322469354, + "rewards/format_reward": 0.031250000931322575, + "step": 152 + }, + { + "completion_length": 1645.4688110351562, + "epoch": 0.04570233739078486, + "grad_norm": 0.08176202327013016, + "kl": 0.004299163818359375, + "learning_rate": 4.567164179104477e-07, + "loss": 0.0518, + "reward": 0.2678571529686451, + "reward_std": 0.18743117526173592, + "rewards/accuracy_reward": 0.20312501210719347, + "rewards/format_reward": 0.06473214738070965, + "step": 153 + }, + { + "completion_length": 1593.02685546875, + "epoch": 0.04600104547830632, + "grad_norm": 0.08287997543811798, + "kl": 0.0054779052734375, + "learning_rate": 4.5970149253731337e-07, + "loss": 0.0621, + "reward": 0.1763392947614193, + "reward_std": 0.2410423383116722, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.06919643096625805, + "step": 154 + }, + { + "completion_length": 1613.2166137695312, + "epoch": 0.04629975356582779, + "grad_norm": 0.10590478777885437, + "kl": 0.00628662109375, + "learning_rate": 4.626865671641791e-07, + "loss": 0.0948, + "reward": 0.3370535932481289, + "reward_std": 0.2665950134396553, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.0825892873108387, + "step": 155 + }, + { + "completion_length": 1913.1540832519531, + "epoch": 0.046598461653349264, + "grad_norm": 0.06815143674612045, + "kl": 0.00650787353515625, + "learning_rate": 4.6567164179104474e-07, + "loss": 0.0528, + "reward": 0.13616072130389512, + "reward_std": 0.10368260834366083, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0267857164144516, + "step": 156 + }, + { + "completion_length": 1617.9442443847656, + "epoch": 0.046897169740870735, + "grad_norm": 0.09312456846237183, + "kl": 0.0067901611328125, + "learning_rate": 4.6865671641791045e-07, + "loss": 0.0663, + "reward": 0.18750000931322575, + "reward_std": 0.22063341550529003, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.06696428824216127, + "step": 157 + }, + { + "completion_length": 1649.1272888183594, + "epoch": 0.047195877828392206, + "grad_norm": 0.08927923440933228, + "kl": 0.00637054443359375, + "learning_rate": 4.7164179104477606e-07, + "loss": 0.0672, + "reward": 0.12500000558793545, + "reward_std": 0.2090802900493145, + "rewards/accuracy_reward": 0.06473214644938707, + "rewards/format_reward": 0.060267859138548374, + "step": 158 + }, + { + "completion_length": 1796.4509582519531, + "epoch": 0.04749458591591367, + "grad_norm": 0.07117683440446854, + "kl": 0.0068359375, + "learning_rate": 4.7462686567164177e-07, + "loss": 0.0672, + "reward": 0.23660715855658054, + "reward_std": 0.15340632759034634, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.10491071944124997, + "step": 159 + }, + { + "completion_length": 1682.4777221679688, + "epoch": 0.04779329400343514, + "grad_norm": 0.10011842846870422, + "kl": 0.006519317626953125, + "learning_rate": 4.776119402985074e-07, + "loss": 0.0743, + "reward": 0.27455359138548374, + "reward_std": 0.2056790143251419, + "rewards/accuracy_reward": 0.2075892984867096, + "rewards/format_reward": 0.0669642873108387, + "step": 160 + }, + { + "completion_length": 1645.55810546875, + "epoch": 0.04809200209095661, + "grad_norm": 0.08082977682352066, + "kl": 0.00687408447265625, + "learning_rate": 4.805970149253731e-07, + "loss": 0.0507, + "reward": 0.2053571529686451, + "reward_std": 0.22583198174834251, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.08705357369035482, + "step": 161 + }, + { + "completion_length": 1671.2500305175781, + "epoch": 0.04839071017847808, + "grad_norm": 0.11209368705749512, + "kl": 0.00885772705078125, + "learning_rate": 4.835820895522387e-07, + "loss": 0.0679, + "reward": 0.23660715483129025, + "reward_std": 0.2679184265434742, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.08035714598372579, + "step": 162 + }, + { + "completion_length": 1721.837158203125, + "epoch": 0.048689418265999554, + "grad_norm": 0.08800669759511948, + "kl": 0.00760650634765625, + "learning_rate": 4.865671641791044e-07, + "loss": 0.0782, + "reward": 0.1718750111758709, + "reward_std": 0.252773217856884, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0848214328289032, + "step": 163 + }, + { + "completion_length": 1662.21435546875, + "epoch": 0.048988126353521025, + "grad_norm": 0.10115931928157806, + "kl": 0.00804901123046875, + "learning_rate": 4.895522388059702e-07, + "loss": 0.0813, + "reward": 0.2299107238650322, + "reward_std": 0.23556623980402946, + "rewards/accuracy_reward": 0.14955357927829027, + "rewards/format_reward": 0.08035714738070965, + "step": 164 + }, + { + "completion_length": 1671.9688110351562, + "epoch": 0.04928683444104249, + "grad_norm": 0.1180092915892601, + "kl": 0.01055145263671875, + "learning_rate": 4.925373134328357e-07, + "loss": 0.0942, + "reward": 0.227678582072258, + "reward_std": 0.2990383207798004, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.0848214328289032, + "step": 165 + }, + { + "completion_length": 1433.9665832519531, + "epoch": 0.04958554252856396, + "grad_norm": 0.12735137343406677, + "kl": 0.01324462890625, + "learning_rate": 4.955223880597015e-07, + "loss": 0.0996, + "reward": 0.4107143133878708, + "reward_std": 0.3602091483771801, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.2209821529686451, + "step": 166 + }, + { + "completion_length": 1639.2880249023438, + "epoch": 0.04988425061608543, + "grad_norm": 0.0764165073633194, + "kl": 0.0125732421875, + "learning_rate": 4.985074626865671e-07, + "loss": 0.0631, + "reward": 0.31696430314332247, + "reward_std": 0.2864900007843971, + "rewards/accuracy_reward": 0.18303572130389512, + "rewards/format_reward": 0.13392857648432255, + "step": 167 + }, + { + "completion_length": 1681.0157165527344, + "epoch": 0.0501829587036069, + "grad_norm": 0.11660360544919968, + "kl": 0.010162353515625, + "learning_rate": 5.014925373134328e-07, + "loss": 0.0888, + "reward": 0.2098214365541935, + "reward_std": 0.2863893024623394, + "rewards/accuracy_reward": 0.08482143469154835, + "rewards/format_reward": 0.1250000037252903, + "step": 168 + }, + { + "completion_length": 1687.2255249023438, + "epoch": 0.05048166679112837, + "grad_norm": 0.09370876103639603, + "kl": 0.01123046875, + "learning_rate": 5.044776119402985e-07, + "loss": 0.0607, + "reward": 0.2299107238650322, + "reward_std": 0.25292637199163437, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.12723215110599995, + "step": 169 + }, + { + "completion_length": 1741.8884887695312, + "epoch": 0.05078037487864984, + "grad_norm": 0.11393048614263535, + "kl": 0.01123046875, + "learning_rate": 5.074626865671642e-07, + "loss": 0.0487, + "reward": 0.3214285783469677, + "reward_std": 0.3029906377196312, + "rewards/accuracy_reward": 0.18526786798611283, + "rewards/format_reward": 0.13616071827709675, + "step": 170 + }, + { + "completion_length": 1655.0089721679688, + "epoch": 0.05107908296617131, + "grad_norm": 0.10353364050388336, + "kl": 0.0130157470703125, + "learning_rate": 5.104477611940299e-07, + "loss": 0.0603, + "reward": 0.2187500074505806, + "reward_std": 0.22533397004008293, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.1428571529686451, + "step": 171 + }, + { + "completion_length": 1717.2054443359375, + "epoch": 0.05137779105369278, + "grad_norm": 0.11801345646381378, + "kl": 0.0144805908203125, + "learning_rate": 5.134328358208954e-07, + "loss": 0.0896, + "reward": 0.1830357275903225, + "reward_std": 0.24658836424350739, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.1160714328289032, + "step": 172 + }, + { + "completion_length": 1745.8348693847656, + "epoch": 0.05167649914121425, + "grad_norm": 0.10562747716903687, + "kl": 0.0128021240234375, + "learning_rate": 5.164179104477612e-07, + "loss": 0.0781, + "reward": 0.2611607238650322, + "reward_std": 0.2496781200170517, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.14955357555299997, + "step": 173 + }, + { + "completion_length": 1819.196533203125, + "epoch": 0.05197520722873572, + "grad_norm": 0.10385270416736603, + "kl": 0.0142822265625, + "learning_rate": 5.194029850746269e-07, + "loss": 0.068, + "reward": 0.16741072200238705, + "reward_std": 0.25611402466893196, + "rewards/accuracy_reward": 0.06473214575089514, + "rewards/format_reward": 0.10267857648432255, + "step": 174 + }, + { + "completion_length": 1821.415283203125, + "epoch": 0.052273915316257184, + "grad_norm": 0.09053435921669006, + "kl": 0.0123443603515625, + "learning_rate": 5.223880597014924e-07, + "loss": 0.0792, + "reward": 0.212053582072258, + "reward_std": 0.2563618794083595, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/format_reward": 0.1183035746216774, + "step": 175 + }, + { + "completion_length": 1701.24560546875, + "epoch": 0.052572623403778655, + "grad_norm": 0.0999012216925621, + "kl": 0.015899658203125, + "learning_rate": 5.253731343283582e-07, + "loss": 0.0828, + "reward": 0.3571428656578064, + "reward_std": 0.306869275867939, + "rewards/accuracy_reward": 0.1808035746216774, + "rewards/format_reward": 0.176339291036129, + "step": 176 + }, + { + "completion_length": 1711.21435546875, + "epoch": 0.052871331491300126, + "grad_norm": 0.12118327617645264, + "kl": 0.0204010009765625, + "learning_rate": 5.283582089552238e-07, + "loss": 0.0777, + "reward": 0.3593750223517418, + "reward_std": 0.3074590042233467, + "rewards/accuracy_reward": 0.2075892984867096, + "rewards/format_reward": 0.15178572200238705, + "step": 177 + }, + { + "completion_length": 1629.1943054199219, + "epoch": 0.0531700395788216, + "grad_norm": 0.12350395321846008, + "kl": 0.0163421630859375, + "learning_rate": 5.313432835820896e-07, + "loss": 0.1175, + "reward": 0.4151785969734192, + "reward_std": 0.35816677659749985, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.1897321492433548, + "step": 178 + }, + { + "completion_length": 1643.8728332519531, + "epoch": 0.05346874766634307, + "grad_norm": 0.12544038891792297, + "kl": 0.0164947509765625, + "learning_rate": 5.343283582089552e-07, + "loss": 0.1185, + "reward": 0.3816964477300644, + "reward_std": 0.33595414087176323, + "rewards/accuracy_reward": 0.16964285913854837, + "rewards/format_reward": 0.21205358393490314, + "step": 179 + }, + { + "completion_length": 1754.6072082519531, + "epoch": 0.05376745575386454, + "grad_norm": 0.09999144822359085, + "kl": 0.0173492431640625, + "learning_rate": 5.373134328358208e-07, + "loss": 0.0703, + "reward": 0.2991071566939354, + "reward_std": 0.2554607167840004, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.1696428656578064, + "step": 180 + }, + { + "completion_length": 1671.0045166015625, + "epoch": 0.054066163841386, + "grad_norm": 0.12121982872486115, + "kl": 0.01995849609375, + "learning_rate": 5.402985074626866e-07, + "loss": 0.0842, + "reward": 0.2566964440047741, + "reward_std": 0.2976120039820671, + "rewards/accuracy_reward": 0.07366071990691125, + "rewards/format_reward": 0.1830357238650322, + "step": 181 + }, + { + "completion_length": 1698.0759582519531, + "epoch": 0.054364871928907474, + "grad_norm": 0.1133880764245987, + "kl": 0.019622802734375, + "learning_rate": 5.432835820895521e-07, + "loss": 0.0952, + "reward": 0.4486607238650322, + "reward_std": 0.37482012808322906, + "rewards/accuracy_reward": 0.23437500931322575, + "rewards/format_reward": 0.2142857238650322, + "step": 182 + }, + { + "completion_length": 1699.1027526855469, + "epoch": 0.054663580016428945, + "grad_norm": 0.16318629682064056, + "kl": 0.019073486328125, + "learning_rate": 5.462686567164179e-07, + "loss": 0.1286, + "reward": 0.3437500111758709, + "reward_std": 0.38481505215168, + "rewards/accuracy_reward": 0.14732143376022577, + "rewards/format_reward": 0.1964285857975483, + "step": 183 + }, + { + "completion_length": 1658.7723999023438, + "epoch": 0.054962288103950416, + "grad_norm": 0.12987668812274933, + "kl": 0.022705078125, + "learning_rate": 5.492537313432836e-07, + "loss": 0.0853, + "reward": 0.3705357313156128, + "reward_std": 0.2576402835547924, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.2566964328289032, + "step": 184 + }, + { + "completion_length": 1741.6272888183594, + "epoch": 0.05526099619147189, + "grad_norm": 0.1551598310470581, + "kl": 0.023345947265625, + "learning_rate": 5.522388059701492e-07, + "loss": 0.111, + "reward": 0.2700892947614193, + "reward_std": 0.31142664328217506, + "rewards/accuracy_reward": 0.10937500465661287, + "rewards/format_reward": 0.16071429662406445, + "step": 185 + }, + { + "completion_length": 1685.4353332519531, + "epoch": 0.05555970427899335, + "grad_norm": 0.13633951544761658, + "kl": 0.020294189453125, + "learning_rate": 5.552238805970149e-07, + "loss": 0.0786, + "reward": 0.3593750149011612, + "reward_std": 0.3169918358325958, + "rewards/accuracy_reward": 0.14285715203732252, + "rewards/format_reward": 0.2165178656578064, + "step": 186 + }, + { + "completion_length": 1744.6763916015625, + "epoch": 0.05585841236651482, + "grad_norm": 0.15413451194763184, + "kl": 0.02252197265625, + "learning_rate": 5.582089552238805e-07, + "loss": 0.1102, + "reward": 0.3191964402794838, + "reward_std": 0.33089349046349525, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.1540178656578064, + "step": 187 + }, + { + "completion_length": 1761.0715026855469, + "epoch": 0.05615712045403629, + "grad_norm": 0.11624488979578018, + "kl": 0.02471923828125, + "learning_rate": 5.611940298507463e-07, + "loss": 0.0804, + "reward": 0.3549107350409031, + "reward_std": 0.3065344877541065, + "rewards/accuracy_reward": 0.15848214970901608, + "rewards/format_reward": 0.1964285746216774, + "step": 188 + }, + { + "completion_length": 1839.4398193359375, + "epoch": 0.056455828541557763, + "grad_norm": 0.09898454695940018, + "kl": 0.02398681640625, + "learning_rate": 5.64179104477612e-07, + "loss": 0.0735, + "reward": 0.2700892984867096, + "reward_std": 0.2769235372543335, + "rewards/accuracy_reward": 0.12500000675208867, + "rewards/format_reward": 0.1450892947614193, + "step": 189 + }, + { + "completion_length": 1658.0491943359375, + "epoch": 0.056754536629079234, + "grad_norm": 0.13869500160217285, + "kl": 0.02392578125, + "learning_rate": 5.671641791044775e-07, + "loss": 0.0974, + "reward": 0.4330357387661934, + "reward_std": 0.3657972142100334, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.2522321604192257, + "step": 190 + }, + { + "completion_length": 1765.58935546875, + "epoch": 0.057053244716600705, + "grad_norm": 0.19944781064987183, + "kl": 0.02874755859375, + "learning_rate": 5.701492537313433e-07, + "loss": 0.0894, + "reward": 0.3549107238650322, + "reward_std": 0.31916364282369614, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.212053582072258, + "step": 191 + }, + { + "completion_length": 1792.7925109863281, + "epoch": 0.05735195280412217, + "grad_norm": 0.12319160252809525, + "kl": 0.030426025390625, + "learning_rate": 5.731343283582089e-07, + "loss": 0.0715, + "reward": 0.2968750149011612, + "reward_std": 0.26519451290369034, + "rewards/accuracy_reward": 0.14285714644938707, + "rewards/format_reward": 0.1540178656578064, + "step": 192 + }, + { + "completion_length": 1632.1117248535156, + "epoch": 0.05765066089164364, + "grad_norm": 0.20008476078510284, + "kl": 0.0330810546875, + "learning_rate": 5.761194029850746e-07, + "loss": 0.1152, + "reward": 0.4687500149011612, + "reward_std": 0.342165544629097, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.2745535783469677, + "step": 193 + }, + { + "completion_length": 1801.7611999511719, + "epoch": 0.05794936897916511, + "grad_norm": 0.1259770542383194, + "kl": 0.035736083984375, + "learning_rate": 5.791044776119403e-07, + "loss": 0.0814, + "reward": 0.23437501303851604, + "reward_std": 0.22866392135620117, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.1718750074505806, + "step": 194 + }, + { + "completion_length": 1730.9844360351562, + "epoch": 0.05824807706668658, + "grad_norm": 0.3481791913509369, + "kl": 0.04144287109375, + "learning_rate": 5.820895522388059e-07, + "loss": 0.1078, + "reward": 0.290178582072258, + "reward_std": 0.3072606511414051, + "rewards/accuracy_reward": 0.08258929220028222, + "rewards/format_reward": 0.2075892984867096, + "step": 195 + }, + { + "completion_length": 1721.6183776855469, + "epoch": 0.05854678515420805, + "grad_norm": 0.19897222518920898, + "kl": 0.03759765625, + "learning_rate": 5.850746268656717e-07, + "loss": 0.1091, + "reward": 0.372767873108387, + "reward_std": 0.3793536201119423, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.2299107313156128, + "step": 196 + }, + { + "completion_length": 1732.5246276855469, + "epoch": 0.05884549324172952, + "grad_norm": 0.12458215653896332, + "kl": 0.03839111328125, + "learning_rate": 5.880597014925372e-07, + "loss": 0.0678, + "reward": 0.310267873108387, + "reward_std": 0.3005557134747505, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.227678582072258, + "step": 197 + }, + { + "completion_length": 1685.3638916015625, + "epoch": 0.05914420132925099, + "grad_norm": 0.16912227869033813, + "kl": 0.03924560546875, + "learning_rate": 5.91044776119403e-07, + "loss": 0.0932, + "reward": 0.38392858766019344, + "reward_std": 0.3285790719091892, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.23437501303851604, + "step": 198 + }, + { + "completion_length": 1641.2456359863281, + "epoch": 0.05944290941677246, + "grad_norm": 0.16178147494792938, + "kl": 0.04144287109375, + "learning_rate": 5.940298507462687e-07, + "loss": 0.0984, + "reward": 0.4017857238650322, + "reward_std": 0.3985406756401062, + "rewards/accuracy_reward": 0.08035714831203222, + "rewards/format_reward": 0.3214285895228386, + "step": 199 + }, + { + "completion_length": 1708.7634887695312, + "epoch": 0.05974161750429393, + "grad_norm": 0.2996484637260437, + "kl": 0.0523681640625, + "learning_rate": 5.970149253731342e-07, + "loss": 0.1325, + "reward": 0.3125000149011612, + "reward_std": 0.32465052604675293, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.26116073317825794, + "step": 200 + }, + { + "completion_length": 1669.6139221191406, + "epoch": 0.0600403255918154, + "grad_norm": 0.2551124691963196, + "kl": 0.0555419921875, + "learning_rate": 6e-07, + "loss": 0.1128, + "reward": 0.3593750111758709, + "reward_std": 0.33468087017536163, + "rewards/accuracy_reward": 0.09821428940631449, + "rewards/format_reward": 0.2611607164144516, + "step": 201 + }, + { + "completion_length": 1795.8327026367188, + "epoch": 0.060339033679336865, + "grad_norm": 0.26417356729507446, + "kl": 0.05950927734375, + "learning_rate": 6.029850746268656e-07, + "loss": 0.1045, + "reward": 0.3370535932481289, + "reward_std": 0.32652005553245544, + "rewards/accuracy_reward": 0.13169643841683865, + "rewards/format_reward": 0.2053571492433548, + "step": 202 + }, + { + "completion_length": 1741.8058776855469, + "epoch": 0.060637741766858336, + "grad_norm": 0.25112929940223694, + "kl": 0.05731201171875, + "learning_rate": 6.059701492537314e-07, + "loss": 0.0951, + "reward": 0.3013393022119999, + "reward_std": 0.3186227157711983, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.2500000149011612, + "step": 203 + }, + { + "completion_length": 1734.5536499023438, + "epoch": 0.06093644985437981, + "grad_norm": 0.2541242837905884, + "kl": 0.0694580078125, + "learning_rate": 6.08955223880597e-07, + "loss": 0.0949, + "reward": 0.377232164144516, + "reward_std": 0.29866689071059227, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.2321428656578064, + "step": 204 + }, + { + "completion_length": 1573.5023193359375, + "epoch": 0.06123515794190128, + "grad_norm": 0.2980501651763916, + "kl": 0.0640869140625, + "learning_rate": 6.119402985074626e-07, + "loss": 0.1044, + "reward": 0.5000000186264515, + "reward_std": 0.42898450046777725, + "rewards/accuracy_reward": 0.15401786426082253, + "rewards/format_reward": 0.3459821566939354, + "step": 205 + }, + { + "completion_length": 1687.2567749023438, + "epoch": 0.06153386602942275, + "grad_norm": 0.4412866532802582, + "kl": 0.0743408203125, + "learning_rate": 6.149253731343284e-07, + "loss": 0.1327, + "reward": 0.348214291036129, + "reward_std": 0.3207091614603996, + "rewards/accuracy_reward": 0.05803571501746774, + "rewards/format_reward": 0.2901785895228386, + "step": 206 + }, + { + "completion_length": 1619.857177734375, + "epoch": 0.06183257411694422, + "grad_norm": 0.47801923751831055, + "kl": 0.088623046875, + "learning_rate": 6.17910447761194e-07, + "loss": 0.1412, + "reward": 0.5245535969734192, + "reward_std": 0.32957956939935684, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.3258928693830967, + "step": 207 + }, + { + "completion_length": 1721.3952026367188, + "epoch": 0.06213128220446568, + "grad_norm": 0.4009077250957489, + "kl": 0.085205078125, + "learning_rate": 6.208955223880597e-07, + "loss": 0.1133, + "reward": 0.2968750149011612, + "reward_std": 0.3253879137337208, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.2566964365541935, + "step": 208 + }, + { + "completion_length": 1635.2568054199219, + "epoch": 0.062429990291987154, + "grad_norm": 0.7505097389221191, + "kl": 0.12109375, + "learning_rate": 6.238805970149253e-07, + "loss": 0.1372, + "reward": 0.3883928656578064, + "reward_std": 0.340421661734581, + "rewards/accuracy_reward": 0.10491071734577417, + "rewards/format_reward": 0.2834821566939354, + "step": 209 + }, + { + "completion_length": 1697.8929138183594, + "epoch": 0.06272869837950862, + "grad_norm": 0.46380335092544556, + "kl": 0.113525390625, + "learning_rate": 6.26865671641791e-07, + "loss": 0.1033, + "reward": 0.404017873108387, + "reward_std": 0.3804030641913414, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.3147321604192257, + "step": 210 + }, + { + "completion_length": 1674.2433776855469, + "epoch": 0.06302740646703009, + "grad_norm": 0.7191123962402344, + "kl": 0.141357421875, + "learning_rate": 6.298507462686567e-07, + "loss": 0.1468, + "reward": 0.5044643059372902, + "reward_std": 0.4193657115101814, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.3303571492433548, + "step": 211 + }, + { + "completion_length": 1740.1719665527344, + "epoch": 0.06332611455455156, + "grad_norm": 0.43751055002212524, + "kl": 0.142333984375, + "learning_rate": 6.328358208955223e-07, + "loss": 0.0854, + "reward": 0.3638393059372902, + "reward_std": 0.3256142921745777, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.2254464365541935, + "step": 212 + }, + { + "completion_length": 1641.8103637695312, + "epoch": 0.06362482264207303, + "grad_norm": 0.7987541556358337, + "kl": 0.16748046875, + "learning_rate": 6.358208955223881e-07, + "loss": 0.161, + "reward": 0.4955357313156128, + "reward_std": 0.4464737996459007, + "rewards/accuracy_reward": 0.1473214402794838, + "rewards/format_reward": 0.3482143059372902, + "step": 213 + }, + { + "completion_length": 1631.1697082519531, + "epoch": 0.0639235307295945, + "grad_norm": 0.8613948822021484, + "kl": 0.216064453125, + "learning_rate": 6.388059701492537e-07, + "loss": 0.1348, + "reward": 0.5290178880095482, + "reward_std": 0.4338912218809128, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.3258928656578064, + "step": 214 + }, + { + "completion_length": 1699.57373046875, + "epoch": 0.06422223881711597, + "grad_norm": 1.074483871459961, + "kl": 0.261962890625, + "learning_rate": 6.417910447761193e-07, + "loss": 0.1646, + "reward": 0.3950893059372902, + "reward_std": 0.3954741209745407, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.3191964402794838, + "step": 215 + }, + { + "completion_length": 1593.6139221191406, + "epoch": 0.06452094690463744, + "grad_norm": 0.7977161407470703, + "kl": 0.33935546875, + "learning_rate": 6.447761194029851e-07, + "loss": 0.1538, + "reward": 0.4196428805589676, + "reward_std": 0.43761545419692993, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.3392857275903225, + "step": 216 + }, + { + "completion_length": 1661.2366943359375, + "epoch": 0.06481965499215891, + "grad_norm": 1.0203351974487305, + "kl": 0.357421875, + "learning_rate": 6.477611940298507e-07, + "loss": 0.1782, + "reward": 0.486607164144516, + "reward_std": 0.408330962061882, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.3125000186264515, + "step": 217 + }, + { + "completion_length": 1595.3951416015625, + "epoch": 0.06511836307968039, + "grad_norm": 1.1183630228042603, + "kl": 0.43994140625, + "learning_rate": 6.507462686567164e-07, + "loss": 0.1741, + "reward": 0.4419643133878708, + "reward_std": 0.46462395787239075, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.3482142984867096, + "step": 218 + }, + { + "completion_length": 1681.7880249023438, + "epoch": 0.06541707116720186, + "grad_norm": 1.0607340335845947, + "kl": 0.44775390625, + "learning_rate": 6.53731343283582e-07, + "loss": 0.1407, + "reward": 0.3593750223517418, + "reward_std": 0.442565880715847, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.2745535857975483, + "step": 219 + }, + { + "completion_length": 1703.2300109863281, + "epoch": 0.06571577925472333, + "grad_norm": 1.4614052772521973, + "kl": 0.56689453125, + "learning_rate": 6.567164179104477e-07, + "loss": 0.1564, + "reward": 0.377232164144516, + "reward_std": 0.39441412687301636, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.2879464477300644, + "step": 220 + }, + { + "completion_length": 1605.6072082519531, + "epoch": 0.06601448734224478, + "grad_norm": 1.248780369758606, + "kl": 0.7080078125, + "learning_rate": 6.597014925373135e-07, + "loss": 0.2079, + "reward": 0.4263393133878708, + "reward_std": 0.4287726357579231, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.3504464402794838, + "step": 221 + }, + { + "completion_length": 1610.5938415527344, + "epoch": 0.06631319542976626, + "grad_norm": 1.1759556531906128, + "kl": 0.830078125, + "learning_rate": 6.62686567164179e-07, + "loss": 0.1855, + "reward": 0.444196455180645, + "reward_std": 0.43647076189517975, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.3191964402794838, + "step": 222 + }, + { + "completion_length": 1488.7366638183594, + "epoch": 0.06661190351728773, + "grad_norm": 1.4881452322006226, + "kl": 1.0068359375, + "learning_rate": 6.656716417910448e-07, + "loss": 0.2137, + "reward": 0.5334821566939354, + "reward_std": 0.48269442468881607, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.3660714402794838, + "step": 223 + }, + { + "completion_length": 1524.6094360351562, + "epoch": 0.0669106116048092, + "grad_norm": 1.895758867263794, + "kl": 1.0068359375, + "learning_rate": 6.686567164179104e-07, + "loss": 0.2344, + "reward": 0.5848214700818062, + "reward_std": 0.5107344761490822, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.4107143059372902, + "step": 224 + }, + { + "completion_length": 1649.0670166015625, + "epoch": 0.06720931969233067, + "grad_norm": 1.6031278371810913, + "kl": 1.1533203125, + "learning_rate": 6.716417910447761e-07, + "loss": 0.2006, + "reward": 0.419642873108387, + "reward_std": 0.4547761231660843, + "rewards/accuracy_reward": 0.05803571850992739, + "rewards/format_reward": 0.361607164144516, + "step": 225 + }, + { + "completion_length": 1552.8214721679688, + "epoch": 0.06750802777985214, + "grad_norm": 2.026189088821411, + "kl": 1.2734375, + "learning_rate": 6.746268656716418e-07, + "loss": 0.2806, + "reward": 0.4888392984867096, + "reward_std": 0.4805227667093277, + "rewards/accuracy_reward": 0.0714285729918629, + "rewards/format_reward": 0.4174107387661934, + "step": 226 + }, + { + "completion_length": 1436.2969360351562, + "epoch": 0.06780673586737361, + "grad_norm": 2.3676578998565674, + "kl": 1.3125, + "learning_rate": 6.776119402985074e-07, + "loss": 0.2567, + "reward": 0.6093750149011612, + "reward_std": 0.5208212956786156, + "rewards/accuracy_reward": 0.1004464365541935, + "rewards/format_reward": 0.5089285969734192, + "step": 227 + }, + { + "completion_length": 1417.2188110351562, + "epoch": 0.06810544395489508, + "grad_norm": 2.987501382827759, + "kl": 1.259765625, + "learning_rate": 6.805970149253732e-07, + "loss": 0.2634, + "reward": 0.5691964477300644, + "reward_std": 0.5369150042533875, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.475446455180645, + "step": 228 + }, + { + "completion_length": 1417.6272888183594, + "epoch": 0.06840415204241655, + "grad_norm": 2.9510490894317627, + "kl": 1.689453125, + "learning_rate": 6.835820895522387e-07, + "loss": 0.3204, + "reward": 0.4665178880095482, + "reward_std": 0.48381660133600235, + "rewards/accuracy_reward": 0.03348214365541935, + "rewards/format_reward": 0.4330357238650322, + "step": 229 + }, + { + "completion_length": 1345.6272888183594, + "epoch": 0.06870286012993802, + "grad_norm": 2.6557655334472656, + "kl": 1.533203125, + "learning_rate": 6.865671641791044e-07, + "loss": 0.2004, + "reward": 0.647321455180645, + "reward_std": 0.5034443810582161, + "rewards/accuracy_reward": 0.08035714528523386, + "rewards/format_reward": 0.5669643208384514, + "step": 230 + }, + { + "completion_length": 1234.7299499511719, + "epoch": 0.0690015682174595, + "grad_norm": 2.2186009883880615, + "kl": 1.74609375, + "learning_rate": 6.895522388059702e-07, + "loss": 0.255, + "reward": 0.6919643133878708, + "reward_std": 0.5169767588376999, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.5691964626312256, + "step": 231 + }, + { + "completion_length": 1127.8750610351562, + "epoch": 0.06930027630498095, + "grad_norm": 3.1867644786834717, + "kl": 1.84765625, + "learning_rate": 6.925373134328358e-07, + "loss": 0.2783, + "reward": 0.707589328289032, + "reward_std": 0.4749508872628212, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.6183035969734192, + "step": 232 + }, + { + "completion_length": 1022.9152069091797, + "epoch": 0.06959898439250242, + "grad_norm": 3.1773500442504883, + "kl": 1.75390625, + "learning_rate": 6.955223880597014e-07, + "loss": 0.211, + "reward": 0.7879464626312256, + "reward_std": 0.4819011986255646, + "rewards/accuracy_reward": 0.10491071920841932, + "rewards/format_reward": 0.6830357611179352, + "step": 233 + }, + { + "completion_length": 1167.8080749511719, + "epoch": 0.06989769248002389, + "grad_norm": 3.352207899093628, + "kl": 1.685546875, + "learning_rate": 6.985074626865671e-07, + "loss": 0.2675, + "reward": 0.7589286118745804, + "reward_std": 0.48940306156873703, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.6294643133878708, + "step": 234 + }, + { + "completion_length": 1249.9576416015625, + "epoch": 0.07019640056754536, + "grad_norm": 2.618262767791748, + "kl": 1.390625, + "learning_rate": 7.014925373134328e-07, + "loss": 0.1791, + "reward": 0.7299107611179352, + "reward_std": 0.48216912150382996, + "rewards/accuracy_reward": 0.06473214458674192, + "rewards/format_reward": 0.6651785969734192, + "step": 235 + }, + { + "completion_length": 1164.386215209961, + "epoch": 0.07049510865506683, + "grad_norm": 3.885664463043213, + "kl": 1.2275390625, + "learning_rate": 7.044776119402985e-07, + "loss": 0.2198, + "reward": 0.8660714626312256, + "reward_std": 0.46064580231904984, + "rewards/accuracy_reward": 0.16071429220028222, + "rewards/format_reward": 0.705357164144516, + "step": 236 + }, + { + "completion_length": 988.6205749511719, + "epoch": 0.0707938167425883, + "grad_norm": 2.404611349105835, + "kl": 1.19921875, + "learning_rate": 7.074626865671641e-07, + "loss": 0.1483, + "reward": 0.8258928954601288, + "reward_std": 0.41661906242370605, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.7410714626312256, + "step": 237 + }, + { + "completion_length": 899.4263916015625, + "epoch": 0.07109252483010978, + "grad_norm": 3.690487861633301, + "kl": 0.970703125, + "learning_rate": 7.104477611940298e-07, + "loss": 0.1678, + "reward": 0.854910746216774, + "reward_std": 0.4539136365056038, + "rewards/accuracy_reward": 0.10044643492437899, + "rewards/format_reward": 0.754464328289032, + "step": 238 + }, + { + "completion_length": 983.4710083007812, + "epoch": 0.07139123291763125, + "grad_norm": 3.161799430847168, + "kl": 0.8720703125, + "learning_rate": 7.134328358208955e-07, + "loss": 0.1914, + "reward": 0.9062500596046448, + "reward_std": 0.47655583173036575, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.7700893133878708, + "step": 239 + }, + { + "completion_length": 984.0268402099609, + "epoch": 0.07168994100515272, + "grad_norm": 1.460526704788208, + "kl": 0.791015625, + "learning_rate": 7.164179104477611e-07, + "loss": 0.0858, + "reward": 0.8839286118745804, + "reward_std": 0.36870284378528595, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.8191964626312256, + "step": 240 + }, + { + "completion_length": 982.4397888183594, + "epoch": 0.07198864909267419, + "grad_norm": 2.1282126903533936, + "kl": 0.8447265625, + "learning_rate": 7.194029850746269e-07, + "loss": 0.1542, + "reward": 0.917410746216774, + "reward_std": 0.48304181545972824, + "rewards/accuracy_reward": 0.15625000977888703, + "rewards/format_reward": 0.761160746216774, + "step": 241 + }, + { + "completion_length": 991.1317443847656, + "epoch": 0.07228735718019566, + "grad_norm": 2.124147891998291, + "kl": 0.974609375, + "learning_rate": 7.223880597014925e-07, + "loss": 0.225, + "reward": 0.9263393133878708, + "reward_std": 0.4514753818511963, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.7790178954601288, + "step": 242 + }, + { + "completion_length": 1113.2991485595703, + "epoch": 0.07258606526771712, + "grad_norm": 2.9307796955108643, + "kl": 0.7197265625, + "learning_rate": 7.253731343283582e-07, + "loss": 0.2126, + "reward": 0.8415178954601288, + "reward_std": 0.4466625824570656, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.7589285969734192, + "step": 243 + }, + { + "completion_length": 1016.8303985595703, + "epoch": 0.07288477335523859, + "grad_norm": 1.8246028423309326, + "kl": 0.8681640625, + "learning_rate": 7.283582089552238e-07, + "loss": 0.1898, + "reward": 0.975446492433548, + "reward_std": 0.40434835851192474, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.7879464626312256, + "step": 244 + }, + { + "completion_length": 1096.2522888183594, + "epoch": 0.07318348144276006, + "grad_norm": 1.1390173435211182, + "kl": 0.787109375, + "learning_rate": 7.313432835820895e-07, + "loss": 0.1359, + "reward": 0.823660746216774, + "reward_std": 0.4095231145620346, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.7812500298023224, + "step": 245 + }, + { + "completion_length": 1152.8973846435547, + "epoch": 0.07348218953028153, + "grad_norm": 2.4069037437438965, + "kl": 0.669921875, + "learning_rate": 7.343283582089553e-07, + "loss": 0.1742, + "reward": 1.0178571790456772, + "reward_std": 0.41132067888975143, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.8058036118745804, + "step": 246 + }, + { + "completion_length": 1031.2187805175781, + "epoch": 0.073780897617803, + "grad_norm": 1.9315718412399292, + "kl": 0.9931640625, + "learning_rate": 7.373134328358208e-07, + "loss": 0.259, + "reward": 0.9553572088479996, + "reward_std": 0.4197695404291153, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.8169643133878708, + "step": 247 + }, + { + "completion_length": 1144.6562805175781, + "epoch": 0.07407960570532447, + "grad_norm": 1.8262662887573242, + "kl": 0.69970703125, + "learning_rate": 7.402985074626865e-07, + "loss": 0.1914, + "reward": 0.8973214775323868, + "reward_std": 0.3979450389742851, + "rewards/accuracy_reward": 0.06696428661234677, + "rewards/format_reward": 0.8303571939468384, + "step": 248 + }, + { + "completion_length": 1108.35498046875, + "epoch": 0.07437831379284594, + "grad_norm": 1.583028793334961, + "kl": 1.115234375, + "learning_rate": 7.432835820895522e-07, + "loss": 0.2065, + "reward": 0.85714291036129, + "reward_std": 0.401195727288723, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.7589285969734192, + "step": 249 + }, + { + "completion_length": 1049.5625305175781, + "epoch": 0.07467702188036741, + "grad_norm": 1.5225874185562134, + "kl": 0.9091796875, + "learning_rate": 7.462686567164179e-07, + "loss": 0.1993, + "reward": 0.8437500298023224, + "reward_std": 0.44488631188869476, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.7678571790456772, + "step": 250 + }, + { + "completion_length": 1268.2991638183594, + "epoch": 0.07497572996788888, + "grad_norm": 2.436020612716675, + "kl": 1.083984375, + "learning_rate": 7.492537313432836e-07, + "loss": 0.2141, + "reward": 0.7656250298023224, + "reward_std": 0.4563822075724602, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.729910746216774, + "step": 251 + }, + { + "completion_length": 1135.3683624267578, + "epoch": 0.07527443805541036, + "grad_norm": 1.7251806259155273, + "kl": 0.876953125, + "learning_rate": 7.522388059701492e-07, + "loss": 0.2035, + "reward": 0.808035746216774, + "reward_std": 0.4210737869143486, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.7566964626312256, + "step": 252 + }, + { + "completion_length": 1102.6763916015625, + "epoch": 0.07557314614293181, + "grad_norm": 2.796931743621826, + "kl": 0.65576171875, + "learning_rate": 7.552238805970149e-07, + "loss": 0.2553, + "reward": 0.9620536267757416, + "reward_std": 0.4821387752890587, + "rewards/accuracy_reward": 0.19642858300358057, + "rewards/format_reward": 0.7656250298023224, + "step": 253 + }, + { + "completion_length": 1150.5625305175781, + "epoch": 0.07587185423045328, + "grad_norm": 1.6462198495864868, + "kl": 0.55419921875, + "learning_rate": 7.582089552238805e-07, + "loss": 0.0834, + "reward": 0.8839285969734192, + "reward_std": 0.4257707819342613, + "rewards/accuracy_reward": 0.08482143143191934, + "rewards/format_reward": 0.7991071790456772, + "step": 254 + }, + { + "completion_length": 1246.3750305175781, + "epoch": 0.07617056231797475, + "grad_norm": 1.5417709350585938, + "kl": 0.818359375, + "learning_rate": 7.611940298507462e-07, + "loss": 0.1745, + "reward": 0.9040178805589676, + "reward_std": 0.4074876457452774, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.785714328289032, + "step": 255 + }, + { + "completion_length": 1210.8170166015625, + "epoch": 0.07646927040549623, + "grad_norm": 1.4749326705932617, + "kl": 0.6552734375, + "learning_rate": 7.64179104477612e-07, + "loss": 0.1912, + "reward": 0.8750000298023224, + "reward_std": 0.38686656206846237, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.7879464626312256, + "step": 256 + }, + { + "completion_length": 1306.575927734375, + "epoch": 0.0767679784930177, + "grad_norm": 1.1722294092178345, + "kl": 0.67431640625, + "learning_rate": 7.671641791044776e-07, + "loss": 0.1842, + "reward": 0.8281250447034836, + "reward_std": 0.3978957533836365, + "rewards/accuracy_reward": 0.055803575087338686, + "rewards/format_reward": 0.7723214626312256, + "step": 257 + }, + { + "completion_length": 1270.6049499511719, + "epoch": 0.07706668658053917, + "grad_norm": 1.1967602968215942, + "kl": 0.6953125, + "learning_rate": 7.701492537313432e-07, + "loss": 0.0814, + "reward": 0.9642857760190964, + "reward_std": 0.43505850434303284, + "rewards/accuracy_reward": 0.17410714738070965, + "rewards/format_reward": 0.7901786118745804, + "step": 258 + }, + { + "completion_length": 1321.5134582519531, + "epoch": 0.07736539466806064, + "grad_norm": 1.381617546081543, + "kl": 0.7265625, + "learning_rate": 7.731343283582089e-07, + "loss": 0.1446, + "reward": 0.8147321790456772, + "reward_std": 0.4017602503299713, + "rewards/accuracy_reward": 0.04910714388824999, + "rewards/format_reward": 0.7656250298023224, + "step": 259 + }, + { + "completion_length": 1297.9710388183594, + "epoch": 0.07766410275558211, + "grad_norm": 1.1765743494033813, + "kl": 0.7763671875, + "learning_rate": 7.761194029850746e-07, + "loss": 0.1469, + "reward": 0.8169643133878708, + "reward_std": 0.49085356295108795, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.7500000298023224, + "step": 260 + }, + { + "completion_length": 1534.6183776855469, + "epoch": 0.07796281084310358, + "grad_norm": 0.9562254548072815, + "kl": 0.751953125, + "learning_rate": 7.791044776119404e-07, + "loss": 0.1499, + "reward": 0.7544643133878708, + "reward_std": 0.46734773367643356, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.723214328289032, + "step": 261 + }, + { + "completion_length": 1515.4219360351562, + "epoch": 0.07826151893062505, + "grad_norm": 2.768551826477051, + "kl": 0.7265625, + "learning_rate": 7.820895522388059e-07, + "loss": 0.0765, + "reward": 0.7745535969734192, + "reward_std": 0.47462020069360733, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.6986607313156128, + "step": 262 + }, + { + "completion_length": 1553.4063415527344, + "epoch": 0.07856022701814652, + "grad_norm": 1.5613338947296143, + "kl": 0.5498046875, + "learning_rate": 7.850746268656716e-07, + "loss": 0.0691, + "reward": 0.8504464775323868, + "reward_std": 0.4210139587521553, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.761160746216774, + "step": 263 + }, + { + "completion_length": 1453.3014221191406, + "epoch": 0.07885893510566798, + "grad_norm": 0.8884609341621399, + "kl": 0.46142578125, + "learning_rate": 7.880597014925373e-07, + "loss": 0.0307, + "reward": 0.88839291036129, + "reward_std": 0.37291476130485535, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.8102678954601288, + "step": 264 + }, + { + "completion_length": 1579.9263916015625, + "epoch": 0.07915764319318945, + "grad_norm": 1.1799639463424683, + "kl": 0.4814453125, + "learning_rate": 7.910447761194029e-07, + "loss": 0.0357, + "reward": 0.7812500447034836, + "reward_std": 0.43595267087221146, + "rewards/accuracy_reward": 0.042410715483129025, + "rewards/format_reward": 0.738839328289032, + "step": 265 + }, + { + "completion_length": 1632.7255249023438, + "epoch": 0.07945635128071092, + "grad_norm": 1.9442304372787476, + "kl": 0.41455078125, + "learning_rate": 7.940298507462686e-07, + "loss": 0.0587, + "reward": 0.767857164144516, + "reward_std": 0.43099992722272873, + "rewards/accuracy_reward": 0.013392857974395156, + "rewards/format_reward": 0.754464328289032, + "step": 266 + }, + { + "completion_length": 1512.24560546875, + "epoch": 0.07975505936823239, + "grad_norm": 1.405840516090393, + "kl": 0.35791015625, + "learning_rate": 7.970149253731343e-07, + "loss": 0.0509, + "reward": 0.9129464775323868, + "reward_std": 0.4004775509238243, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.816964328289032, + "step": 267 + }, + { + "completion_length": 1573.9063415527344, + "epoch": 0.08005376745575386, + "grad_norm": 1.1799770593643188, + "kl": 0.282958984375, + "learning_rate": 8e-07, + "loss": -0.0137, + "reward": 0.917410746216774, + "reward_std": 0.39242667704820633, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.8303571790456772, + "step": 268 + }, + { + "completion_length": 1553.0313415527344, + "epoch": 0.08035247554327533, + "grad_norm": 1.8872207403182983, + "kl": 0.28564453125, + "learning_rate": 8.029850746268656e-07, + "loss": 0.064, + "reward": 0.9129464626312256, + "reward_std": 0.4602925032377243, + "rewards/accuracy_reward": 0.15401786472648382, + "rewards/format_reward": 0.7589285969734192, + "step": 269 + }, + { + "completion_length": 1535.7523193359375, + "epoch": 0.0806511836307968, + "grad_norm": 2.5444858074188232, + "kl": 0.27734375, + "learning_rate": 8.059701492537313e-07, + "loss": 0.0494, + "reward": 1.0111607760190964, + "reward_std": 0.44800323247909546, + "rewards/accuracy_reward": 0.21428572572767735, + "rewards/format_reward": 0.7968750447034836, + "step": 270 + }, + { + "completion_length": 1612.2388916015625, + "epoch": 0.08094989171831828, + "grad_norm": 3.9535982608795166, + "kl": 0.34521484375, + "learning_rate": 8.08955223880597e-07, + "loss": 0.0722, + "reward": 0.8459821790456772, + "reward_std": 0.5313192903995514, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.7008928805589676, + "step": 271 + }, + { + "completion_length": 1456.80810546875, + "epoch": 0.08124859980583975, + "grad_norm": 2.375042200088501, + "kl": 0.342529296875, + "learning_rate": 8.119402985074626e-07, + "loss": 0.0065, + "reward": 0.8950893133878708, + "reward_std": 0.43078020215034485, + "rewards/accuracy_reward": 0.11160714877769351, + "rewards/format_reward": 0.7834821939468384, + "step": 272 + }, + { + "completion_length": 1391.9754943847656, + "epoch": 0.08154730789336122, + "grad_norm": 2.2077202796936035, + "kl": 0.374755859375, + "learning_rate": 8.149253731343283e-07, + "loss": -0.0513, + "reward": 0.9285714626312256, + "reward_std": 0.43731626868247986, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.8281250298023224, + "step": 273 + }, + { + "completion_length": 1398.1741638183594, + "epoch": 0.08184601598088269, + "grad_norm": 111.725341796875, + "kl": 2.670166015625, + "learning_rate": 8.17910447761194e-07, + "loss": 0.1755, + "reward": 1.1049107611179352, + "reward_std": 0.41415711492300034, + "rewards/accuracy_reward": 0.2723214402794838, + "rewards/format_reward": 0.832589328289032, + "step": 274 + }, + { + "completion_length": 1343.1808776855469, + "epoch": 0.08214472406840415, + "grad_norm": 1.2166038751602173, + "kl": 0.38427734375, + "learning_rate": 8.208955223880597e-07, + "loss": 0.0351, + "reward": 0.9955357611179352, + "reward_std": 0.428667388856411, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.8415178954601288, + "step": 275 + }, + { + "completion_length": 1313.7857666015625, + "epoch": 0.08244343215592562, + "grad_norm": 3.6078550815582275, + "kl": 0.226318359375, + "learning_rate": 8.238805970149253e-07, + "loss": 0.0362, + "reward": 0.9933035969734192, + "reward_std": 0.38541190326213837, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.8750000298023224, + "step": 276 + }, + { + "completion_length": 1407.8973693847656, + "epoch": 0.08274214024344709, + "grad_norm": 0.45252111554145813, + "kl": 0.1875, + "learning_rate": 8.26865671641791e-07, + "loss": 0.0301, + "reward": 0.9486607611179352, + "reward_std": 0.4469023868441582, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.7901785969734192, + "step": 277 + }, + { + "completion_length": 1440.51123046875, + "epoch": 0.08304084833096856, + "grad_norm": 0.2397698312997818, + "kl": 0.2095947265625, + "learning_rate": 8.298507462686567e-07, + "loss": -0.0167, + "reward": 0.9308036118745804, + "reward_std": 0.44533832371234894, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.8013393133878708, + "step": 278 + }, + { + "completion_length": 1330.1786193847656, + "epoch": 0.08333955641849003, + "grad_norm": 0.6860783100128174, + "kl": 0.14306640625, + "learning_rate": 8.328358208955224e-07, + "loss": 0.0754, + "reward": 0.93526791036129, + "reward_std": 0.4273504912853241, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.8080357611179352, + "step": 279 + }, + { + "completion_length": 1374.3683471679688, + "epoch": 0.0836382645060115, + "grad_norm": 1.1493628025054932, + "kl": 0.713134765625, + "learning_rate": 8.35820895522388e-07, + "loss": 0.0829, + "reward": 0.8660714626312256, + "reward_std": 0.40816473215818405, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.7812500298023224, + "step": 280 + }, + { + "completion_length": 1348.8929443359375, + "epoch": 0.08393697259353297, + "grad_norm": 0.7376496195793152, + "kl": 0.1605224609375, + "learning_rate": 8.388059701492537e-07, + "loss": 0.064, + "reward": 0.9486607611179352, + "reward_std": 0.41827329248189926, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.8125000298023224, + "step": 281 + }, + { + "completion_length": 1403.9598693847656, + "epoch": 0.08423568068105444, + "grad_norm": 1.2076518535614014, + "kl": 0.0802001953125, + "learning_rate": 8.417910447761194e-07, + "loss": 0.0845, + "reward": 1.0200893580913544, + "reward_std": 0.42826147377491, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.816964328289032, + "step": 282 + }, + { + "completion_length": 1383.7634582519531, + "epoch": 0.08453438876857591, + "grad_norm": 3.4429352283477783, + "kl": 0.10113525390625, + "learning_rate": 8.44776119402985e-07, + "loss": 0.0952, + "reward": 0.91964291036129, + "reward_std": 0.39051342010498047, + "rewards/accuracy_reward": 0.11383929592557251, + "rewards/format_reward": 0.8058036118745804, + "step": 283 + }, + { + "completion_length": 1367.216552734375, + "epoch": 0.08483309685609738, + "grad_norm": 2.6421263217926025, + "kl": 0.144775390625, + "learning_rate": 8.477611940298507e-07, + "loss": 0.0814, + "reward": 1.0446428805589676, + "reward_std": 0.38067271932959557, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.8593750447034836, + "step": 284 + }, + { + "completion_length": 1460.7857971191406, + "epoch": 0.08513180494361886, + "grad_norm": 4.2304463386535645, + "kl": 0.1934814453125, + "learning_rate": 8.507462686567164e-07, + "loss": 0.0582, + "reward": 0.9062500298023224, + "reward_std": 0.35469773411750793, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.808035746216774, + "step": 285 + }, + { + "completion_length": 1336.7054138183594, + "epoch": 0.08543051303114031, + "grad_norm": 3.1734936237335205, + "kl": 0.36767578125, + "learning_rate": 8.537313432835821e-07, + "loss": 0.0723, + "reward": 1.0870536267757416, + "reward_std": 0.33245836198329926, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.879464328289032, + "step": 286 + }, + { + "completion_length": 1341.9308471679688, + "epoch": 0.08572922111866178, + "grad_norm": 0.6650550961494446, + "kl": 0.43701171875, + "learning_rate": 8.567164179104477e-07, + "loss": 0.1032, + "reward": 1.0848215073347092, + "reward_std": 0.3209443725645542, + "rewards/accuracy_reward": 0.20758929662406445, + "rewards/format_reward": 0.8772321790456772, + "step": 287 + }, + { + "completion_length": 1427.1831359863281, + "epoch": 0.08602792920618325, + "grad_norm": 0.862920343875885, + "kl": 0.5146484375, + "learning_rate": 8.597014925373134e-07, + "loss": 0.0322, + "reward": 0.9062500447034836, + "reward_std": 0.326714351773262, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.8593750447034836, + "step": 288 + }, + { + "completion_length": 1433.0514221191406, + "epoch": 0.08632663729370472, + "grad_norm": 1.402682900428772, + "kl": 0.5654296875, + "learning_rate": 8.626865671641791e-07, + "loss": 0.0529, + "reward": 0.9933035969734192, + "reward_std": 0.31057019904255867, + "rewards/accuracy_reward": 0.12500000419095159, + "rewards/format_reward": 0.8683035969734192, + "step": 289 + }, + { + "completion_length": 1363.7857971191406, + "epoch": 0.0866253453812262, + "grad_norm": 0.5374418497085571, + "kl": 0.55810546875, + "learning_rate": 8.656716417910447e-07, + "loss": 0.053, + "reward": 1.1026786416769028, + "reward_std": 0.32186245173215866, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9017857611179352, + "step": 290 + }, + { + "completion_length": 1340.68310546875, + "epoch": 0.08692405346874767, + "grad_norm": 0.6400312185287476, + "kl": 0.4365234375, + "learning_rate": 8.686567164179104e-07, + "loss": 0.0335, + "reward": 1.1205357909202576, + "reward_std": 0.3210472948849201, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.9062500447034836, + "step": 291 + }, + { + "completion_length": 1281.0469360351562, + "epoch": 0.08722276155626914, + "grad_norm": 0.7262424230575562, + "kl": 0.4765625, + "learning_rate": 8.716417910447761e-07, + "loss": 0.0778, + "reward": 0.957589328289032, + "reward_std": 0.3808277025818825, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.8683035969734192, + "step": 292 + }, + { + "completion_length": 1326.1340026855469, + "epoch": 0.08752146964379061, + "grad_norm": 1.011535882949829, + "kl": 0.39697265625, + "learning_rate": 8.746268656716418e-07, + "loss": 0.0658, + "reward": 1.089285746216774, + "reward_std": 0.30048296228051186, + "rewards/accuracy_reward": 0.18303572572767735, + "rewards/format_reward": 0.9062500596046448, + "step": 293 + }, + { + "completion_length": 1474.1764221191406, + "epoch": 0.08782017773131208, + "grad_norm": 1.9278242588043213, + "kl": 0.369140625, + "learning_rate": 8.776119402985074e-07, + "loss": 0.0409, + "reward": 0.9196428954601288, + "reward_std": 0.3303867131471634, + "rewards/accuracy_reward": 0.046875002793967724, + "rewards/format_reward": 0.8727678954601288, + "step": 294 + }, + { + "completion_length": 1476.6339721679688, + "epoch": 0.08811888581883355, + "grad_norm": 1.0983775854110718, + "kl": 0.333984375, + "learning_rate": 8.805970149253731e-07, + "loss": 0.0546, + "reward": 1.0200893133878708, + "reward_std": 0.3329809308052063, + "rewards/accuracy_reward": 0.13169643748551607, + "rewards/format_reward": 0.88839291036129, + "step": 295 + }, + { + "completion_length": 1389.2076416015625, + "epoch": 0.08841759390635502, + "grad_norm": 2.006490707397461, + "kl": 0.37060546875, + "learning_rate": 8.835820895522388e-07, + "loss": 0.0827, + "reward": 1.037946492433548, + "reward_std": 0.3296668715775013, + "rewards/accuracy_reward": 0.14508929196745157, + "rewards/format_reward": 0.8928571790456772, + "step": 296 + }, + { + "completion_length": 1338.5781555175781, + "epoch": 0.08871630199387648, + "grad_norm": 1.349937081336975, + "kl": 0.30078125, + "learning_rate": 8.865671641791045e-07, + "loss": 0.0723, + "reward": 1.0580357611179352, + "reward_std": 0.3332560881972313, + "rewards/accuracy_reward": 0.15848214784637094, + "rewards/format_reward": 0.8995536267757416, + "step": 297 + }, + { + "completion_length": 1320.5848693847656, + "epoch": 0.08901501008139795, + "grad_norm": 0.8452871441841125, + "kl": 0.298583984375, + "learning_rate": 8.8955223880597e-07, + "loss": 0.0618, + "reward": 0.926339328289032, + "reward_std": 0.31348854303359985, + "rewards/accuracy_reward": 0.07812500302679837, + "rewards/format_reward": 0.848214328289032, + "step": 298 + }, + { + "completion_length": 1171.3326416015625, + "epoch": 0.08931371816891942, + "grad_norm": 0.7818227410316467, + "kl": 0.2470703125, + "learning_rate": 8.925373134328358e-07, + "loss": 0.0391, + "reward": 0.9799107611179352, + "reward_std": 0.20694895833730698, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.9285714775323868, + "step": 299 + }, + { + "completion_length": 1248.7366638183594, + "epoch": 0.08961242625644089, + "grad_norm": 0.6492113471031189, + "kl": 0.293212890625, + "learning_rate": 8.955223880597015e-07, + "loss": 0.0628, + "reward": 1.1205357611179352, + "reward_std": 0.3488881178200245, + "rewards/accuracy_reward": 0.24330358998849988, + "rewards/format_reward": 0.8772321939468384, + "step": 300 + }, + { + "completion_length": 1342.825927734375, + "epoch": 0.08991113434396236, + "grad_norm": 2.039257287979126, + "kl": 0.3828125, + "learning_rate": 8.98507462686567e-07, + "loss": 0.0973, + "reward": 0.944196492433548, + "reward_std": 0.36238838732242584, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.8325893133878708, + "step": 301 + }, + { + "completion_length": 1209.2835388183594, + "epoch": 0.09020984243148383, + "grad_norm": 0.3687964081764221, + "kl": 0.2099609375, + "learning_rate": 9.014925373134328e-07, + "loss": 0.0413, + "reward": 1.0446428954601288, + "reward_std": 0.3109283596277237, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.88839291036129, + "step": 302 + }, + { + "completion_length": 1336.435302734375, + "epoch": 0.0905085505190053, + "grad_norm": 0.8232405781745911, + "kl": 0.18310546875, + "learning_rate": 9.044776119402984e-07, + "loss": 0.0949, + "reward": 0.9218750596046448, + "reward_std": 0.3930703476071358, + "rewards/accuracy_reward": 0.11160715017467737, + "rewards/format_reward": 0.8102678954601288, + "step": 303 + }, + { + "completion_length": 1396.8973999023438, + "epoch": 0.09080725860652678, + "grad_norm": 0.7211048007011414, + "kl": 0.184326171875, + "learning_rate": 9.074626865671642e-07, + "loss": 0.097, + "reward": 0.8191964626312256, + "reward_std": 0.40971020609140396, + "rewards/accuracy_reward": 0.0848214291036129, + "rewards/format_reward": 0.7343750298023224, + "step": 304 + }, + { + "completion_length": 1301.1094360351562, + "epoch": 0.09110596669404825, + "grad_norm": 0.786010205745697, + "kl": 0.14892578125, + "learning_rate": 9.104477611940298e-07, + "loss": 0.1262, + "reward": 0.8348214775323868, + "reward_std": 0.4173192232847214, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.7522321790456772, + "step": 305 + }, + { + "completion_length": 1374.9911193847656, + "epoch": 0.09140467478156972, + "grad_norm": 0.9563223719596863, + "kl": 0.177001953125, + "learning_rate": 9.134328358208954e-07, + "loss": 0.1088, + "reward": 0.8392857313156128, + "reward_std": 0.46519042551517487, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.6941964626312256, + "step": 306 + }, + { + "completion_length": 1230.3772888183594, + "epoch": 0.09170338286909117, + "grad_norm": 3.1136395931243896, + "kl": 0.236572265625, + "learning_rate": 9.164179104477612e-07, + "loss": 0.1138, + "reward": 1.0022322088479996, + "reward_std": 0.41185739636421204, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.8035714775323868, + "step": 307 + }, + { + "completion_length": 1310.8906860351562, + "epoch": 0.09200209095661264, + "grad_norm": 1.2998679876327515, + "kl": 0.22802734375, + "learning_rate": 9.194029850746267e-07, + "loss": 0.1313, + "reward": 0.8839286118745804, + "reward_std": 0.44626687467098236, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.7723214626312256, + "step": 308 + }, + { + "completion_length": 1118.9687805175781, + "epoch": 0.09230079904413412, + "grad_norm": 0.4615563750267029, + "kl": 0.2138671875, + "learning_rate": 9.223880597014925e-07, + "loss": 0.0833, + "reward": 0.9732143431901932, + "reward_std": 0.4016486033797264, + "rewards/accuracy_reward": 0.16294643469154835, + "rewards/format_reward": 0.8102678954601288, + "step": 309 + }, + { + "completion_length": 1081.2322082519531, + "epoch": 0.09259950713165559, + "grad_norm": 3.518371820449829, + "kl": 0.278564453125, + "learning_rate": 9.253731343283582e-07, + "loss": 0.0685, + "reward": 0.9642857611179352, + "reward_std": 0.39580587297677994, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.801339328289032, + "step": 310 + }, + { + "completion_length": 1156.2835388183594, + "epoch": 0.09289821521917706, + "grad_norm": 1.0692121982574463, + "kl": 0.2158203125, + "learning_rate": 9.283582089552238e-07, + "loss": 0.1144, + "reward": 0.8906250447034836, + "reward_std": 0.38030610233545303, + "rewards/accuracy_reward": 0.08482143119908869, + "rewards/format_reward": 0.8058035969734192, + "step": 311 + }, + { + "completion_length": 1104.4465026855469, + "epoch": 0.09319692330669853, + "grad_norm": 0.6787509918212891, + "kl": 0.250244140625, + "learning_rate": 9.313432835820895e-07, + "loss": 0.0965, + "reward": 1.0200893133878708, + "reward_std": 0.4406134560704231, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.7901785969734192, + "step": 312 + }, + { + "completion_length": 1215.2344360351562, + "epoch": 0.09349563139422, + "grad_norm": 0.7815726399421692, + "kl": 0.234375, + "learning_rate": 9.343283582089551e-07, + "loss": 0.0863, + "reward": 0.9062500447034836, + "reward_std": 0.36298441141843796, + "rewards/accuracy_reward": 0.08705357438884676, + "rewards/format_reward": 0.8191964477300644, + "step": 313 + }, + { + "completion_length": 1163.9933471679688, + "epoch": 0.09379433948174147, + "grad_norm": 5.95475435256958, + "kl": 0.374755859375, + "learning_rate": 9.373134328358209e-07, + "loss": 0.0887, + "reward": 0.9151786118745804, + "reward_std": 0.38382738083601, + "rewards/accuracy_reward": 0.11383929033763707, + "rewards/format_reward": 0.801339328289032, + "step": 314 + }, + { + "completion_length": 1174.1540832519531, + "epoch": 0.09409304756926294, + "grad_norm": 0.9133674502372742, + "kl": 0.241455078125, + "learning_rate": 9.402985074626866e-07, + "loss": 0.1041, + "reward": 0.9352678954601288, + "reward_std": 0.39955058693885803, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.8125000447034836, + "step": 315 + }, + { + "completion_length": 1223.404052734375, + "epoch": 0.09439175565678441, + "grad_norm": 0.9763960242271423, + "kl": 0.22607421875, + "learning_rate": 9.432835820895521e-07, + "loss": 0.1371, + "reward": 1.082589328289032, + "reward_std": 0.45458976179361343, + "rewards/accuracy_reward": 0.2678571529686451, + "rewards/format_reward": 0.8147321790456772, + "step": 316 + }, + { + "completion_length": 1080.66748046875, + "epoch": 0.09469046374430588, + "grad_norm": 2.033527135848999, + "kl": 0.219970703125, + "learning_rate": 9.462686567164179e-07, + "loss": 0.1416, + "reward": 0.9308036118745804, + "reward_std": 0.37274259328842163, + "rewards/accuracy_reward": 0.14062500488944352, + "rewards/format_reward": 0.7901786118745804, + "step": 317 + }, + { + "completion_length": 1193.2835083007812, + "epoch": 0.09498917183182734, + "grad_norm": 1.3314160108566284, + "kl": 0.242919921875, + "learning_rate": 9.492537313432835e-07, + "loss": 0.1486, + "reward": 0.8861607760190964, + "reward_std": 0.3964538499712944, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.7879464626312256, + "step": 318 + }, + { + "completion_length": 1123.4911193847656, + "epoch": 0.09528787991934881, + "grad_norm": 1.9087787866592407, + "kl": 0.297607421875, + "learning_rate": 9.522388059701492e-07, + "loss": 0.1075, + "reward": 0.9285714775323868, + "reward_std": 0.4511011838912964, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.792410746216774, + "step": 319 + }, + { + "completion_length": 1149.513427734375, + "epoch": 0.09558658800687028, + "grad_norm": 0.8674450516700745, + "kl": 0.225341796875, + "learning_rate": 9.552238805970149e-07, + "loss": 0.0989, + "reward": 0.8482143431901932, + "reward_std": 0.42597106099128723, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.7723214775323868, + "step": 320 + }, + { + "completion_length": 1103.5625610351562, + "epoch": 0.09588529609439175, + "grad_norm": 1.7767112255096436, + "kl": 0.2744140625, + "learning_rate": 9.582089552238805e-07, + "loss": 0.1054, + "reward": 0.8995536118745804, + "reward_std": 0.44139377772808075, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.7700893133878708, + "step": 321 + }, + { + "completion_length": 1202.0536499023438, + "epoch": 0.09618400418191322, + "grad_norm": 2.3822853565216064, + "kl": 0.37109375, + "learning_rate": 9.611940298507462e-07, + "loss": 0.1506, + "reward": 0.8593750447034836, + "reward_std": 0.48177944868803024, + "rewards/accuracy_reward": 0.13392857508733869, + "rewards/format_reward": 0.7254464626312256, + "step": 322 + }, + { + "completion_length": 1236.3103332519531, + "epoch": 0.0964827122694347, + "grad_norm": 1.6573885679244995, + "kl": 0.41064453125, + "learning_rate": 9.641791044776118e-07, + "loss": 0.1154, + "reward": 0.7879464626312256, + "reward_std": 0.42466332763433456, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.6875000447034836, + "step": 323 + }, + { + "completion_length": 1102.7835540771484, + "epoch": 0.09678142035695617, + "grad_norm": 1.0656088590621948, + "kl": 0.42529296875, + "learning_rate": 9.671641791044775e-07, + "loss": 0.1227, + "reward": 0.8236607611179352, + "reward_std": 0.4408448338508606, + "rewards/accuracy_reward": 0.04017857275903225, + "rewards/format_reward": 0.7834821790456772, + "step": 324 + }, + { + "completion_length": 1053.2500457763672, + "epoch": 0.09708012844447764, + "grad_norm": 0.9572561383247375, + "kl": 0.47705078125, + "learning_rate": 9.701492537313434e-07, + "loss": 0.0998, + "reward": 0.9084821790456772, + "reward_std": 0.4475301131606102, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.7745536118745804, + "step": 325 + }, + { + "completion_length": 1138.857192993164, + "epoch": 0.09737883653199911, + "grad_norm": 1.5107921361923218, + "kl": 0.52587890625, + "learning_rate": 9.731343283582088e-07, + "loss": 0.1286, + "reward": 0.854910746216774, + "reward_std": 0.5181312263011932, + "rewards/accuracy_reward": 0.1718750037252903, + "rewards/format_reward": 0.683035746216774, + "step": 326 + }, + { + "completion_length": 1118.091537475586, + "epoch": 0.09767754461952058, + "grad_norm": 2.6772518157958984, + "kl": 0.5791015625, + "learning_rate": 9.761194029850745e-07, + "loss": 0.1664, + "reward": 0.7991071939468384, + "reward_std": 0.5277887135744095, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.642857164144516, + "step": 327 + }, + { + "completion_length": 1076.3437957763672, + "epoch": 0.09797625270704205, + "grad_norm": 4.565573692321777, + "kl": 0.60546875, + "learning_rate": 9.791044776119403e-07, + "loss": 0.1844, + "reward": 0.8459821790456772, + "reward_std": 0.4482894465327263, + "rewards/accuracy_reward": 0.14062500675208867, + "rewards/format_reward": 0.7053571790456772, + "step": 328 + }, + { + "completion_length": 1073.9129943847656, + "epoch": 0.0982749607945635, + "grad_norm": 2.759122371673584, + "kl": 0.70556640625, + "learning_rate": 9.82089552238806e-07, + "loss": 0.1359, + "reward": 0.7968750447034836, + "reward_std": 0.4878402650356293, + "rewards/accuracy_reward": 0.12723215110599995, + "rewards/format_reward": 0.6696428954601288, + "step": 329 + }, + { + "completion_length": 1109.1629943847656, + "epoch": 0.09857366888208498, + "grad_norm": 40745.9140625, + "kl": 273.953125, + "learning_rate": 9.850746268656714e-07, + "loss": 13.9118, + "reward": 0.7075893133878708, + "reward_std": 0.4703219309449196, + "rewards/accuracy_reward": 0.040178574388846755, + "rewards/format_reward": 0.667410746216774, + "step": 330 + }, + { + "completion_length": 1159.1116638183594, + "epoch": 0.09887237696960645, + "grad_norm": 62127.6484375, + "kl": 519.75, + "learning_rate": 9.880597014925373e-07, + "loss": 31.3244, + "reward": 0.7946428954601288, + "reward_std": 0.49274180829524994, + "rewards/accuracy_reward": 0.10267857764847577, + "rewards/format_reward": 0.6919643133878708, + "step": 331 + }, + { + "completion_length": 1214.2500305175781, + "epoch": 0.09917108505712792, + "grad_norm": 31663.39453125, + "kl": 200.3125, + "learning_rate": 9.91044776119403e-07, + "loss": 9.2805, + "reward": 0.7500000298023224, + "reward_std": 0.4751335233449936, + "rewards/accuracy_reward": 0.09151785913854837, + "rewards/format_reward": 0.658482164144516, + "step": 332 + }, + { + "completion_length": 1206.9732666015625, + "epoch": 0.09946979314464939, + "grad_norm": 121.09956359863281, + "kl": 4.94140625, + "learning_rate": 9.940298507462686e-07, + "loss": 0.5323, + "reward": 0.6495535969734192, + "reward_std": 0.5223339274525642, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.5825892984867096, + "step": 333 + }, + { + "completion_length": 1211.6518249511719, + "epoch": 0.09976850123217086, + "grad_norm": 13.670095443725586, + "kl": 2.255859375, + "learning_rate": 9.970149253731343e-07, + "loss": 0.3811, + "reward": 0.7455357611179352, + "reward_std": 0.5133942440152168, + "rewards/accuracy_reward": 0.10044643329456449, + "rewards/format_reward": 0.6450893133878708, + "step": 334 + }, + { + "completion_length": 1095.4955749511719, + "epoch": 0.10006720931969233, + "grad_norm": 18.739032745361328, + "kl": 1.892578125, + "learning_rate": 1e-06, + "loss": 0.3119, + "reward": 0.7678571939468384, + "reward_std": 0.5001451149582863, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.6473214477300644, + "step": 335 + }, + { + "completion_length": 1119.9442443847656, + "epoch": 0.1003659174072138, + "grad_norm": 14.446822166442871, + "kl": 1.947265625, + "learning_rate": 9.999997552220525e-07, + "loss": 0.3746, + "reward": 0.7031250447034836, + "reward_std": 0.525195524096489, + "rewards/accuracy_reward": 0.06026786100119352, + "rewards/format_reward": 0.642857164144516, + "step": 336 + }, + { + "completion_length": 1197.6161193847656, + "epoch": 0.10066462549473527, + "grad_norm": 61.89336395263672, + "kl": 4.244140625, + "learning_rate": 9.999990208884757e-07, + "loss": 0.4673, + "reward": 0.7031250149011612, + "reward_std": 0.4832093343138695, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.6294643133878708, + "step": 337 + }, + { + "completion_length": 1216.6853332519531, + "epoch": 0.10096333358225675, + "grad_norm": 11.157196998596191, + "kl": 1.994140625, + "learning_rate": 9.99997797000069e-07, + "loss": 0.2948, + "reward": 0.7366071864962578, + "reward_std": 0.5073533952236176, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.6004464477300644, + "step": 338 + }, + { + "completion_length": 1205.2411193847656, + "epoch": 0.10126204166977822, + "grad_norm": 15.466999053955078, + "kl": 1.1416015625, + "learning_rate": 9.999960835581636e-07, + "loss": 0.3085, + "reward": 0.6964285969734192, + "reward_std": 0.49573398381471634, + "rewards/accuracy_reward": 0.07142857369035482, + "rewards/format_reward": 0.6250000298023224, + "step": 339 + }, + { + "completion_length": 1174.5089721679688, + "epoch": 0.10156074975729967, + "grad_norm": 8.844285011291504, + "kl": 1.0224609375, + "learning_rate": 9.999938805646239e-07, + "loss": 0.2985, + "reward": 0.6741071790456772, + "reward_std": 0.5250176787376404, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.6138393133878708, + "step": 340 + }, + { + "completion_length": 1354.4018859863281, + "epoch": 0.10185945784482114, + "grad_norm": 9.394603729248047, + "kl": 1.67578125, + "learning_rate": 9.999911880218462e-07, + "loss": 0.3413, + "reward": 0.6651785969734192, + "reward_std": 0.5552077889442444, + "rewards/accuracy_reward": 0.15848215389996767, + "rewards/format_reward": 0.506696455180645, + "step": 341 + }, + { + "completion_length": 1254.8014221191406, + "epoch": 0.10215816593234262, + "grad_norm": 11.832247734069824, + "kl": 1.1484375, + "learning_rate": 9.999880059327598e-07, + "loss": 0.2856, + "reward": 0.714285746216774, + "reward_std": 0.5608807951211929, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.5915178805589676, + "step": 342 + }, + { + "completion_length": 1243.3705749511719, + "epoch": 0.10245687401986409, + "grad_norm": 10.943278312683105, + "kl": 1.3369140625, + "learning_rate": 9.999843343008264e-07, + "loss": 0.2802, + "reward": 0.714285746216774, + "reward_std": 0.4745236337184906, + "rewards/accuracy_reward": 0.09151785913854837, + "rewards/format_reward": 0.6227678805589676, + "step": 343 + }, + { + "completion_length": 1340.15185546875, + "epoch": 0.10275558210738556, + "grad_norm": 17.950252532958984, + "kl": 3.31640625, + "learning_rate": 9.999801731300407e-07, + "loss": 0.4759, + "reward": 0.6473214700818062, + "reward_std": 0.5579516589641571, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.529017873108387, + "step": 344 + }, + { + "completion_length": 1301.0870971679688, + "epoch": 0.10305429019490703, + "grad_norm": 8.80040168762207, + "kl": 2.091796875, + "learning_rate": 9.999755224249292e-07, + "loss": 0.3868, + "reward": 0.6272321790456772, + "reward_std": 0.5239974409341812, + "rewards/accuracy_reward": 0.05133928940631449, + "rewards/format_reward": 0.5758928954601288, + "step": 345 + }, + { + "completion_length": 1414.26123046875, + "epoch": 0.1033529982824285, + "grad_norm": 12.530730247497559, + "kl": 1.3720703125, + "learning_rate": 9.999703821905516e-07, + "loss": 0.2887, + "reward": 0.5714286044239998, + "reward_std": 0.49533750116825104, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.488839291036129, + "step": 346 + }, + { + "completion_length": 1545.9531860351562, + "epoch": 0.10365170636994997, + "grad_norm": 11.185501098632812, + "kl": 1.04296875, + "learning_rate": 9.999647524325e-07, + "loss": 0.251, + "reward": 0.522321455180645, + "reward_std": 0.5196292698383331, + "rewards/accuracy_reward": 0.06919643306173384, + "rewards/format_reward": 0.4531250149011612, + "step": 347 + }, + { + "completion_length": 1466.2723999023438, + "epoch": 0.10395041445747144, + "grad_norm": 4.4588236808776855, + "kl": 1.697265625, + "learning_rate": 9.999586331568992e-07, + "loss": 0.3027, + "reward": 0.5602678805589676, + "reward_std": 0.5614064335823059, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.4598214477300644, + "step": 348 + }, + { + "completion_length": 1480.6027221679688, + "epoch": 0.10424912254499291, + "grad_norm": 3.9899826049804688, + "kl": 1.533203125, + "learning_rate": 9.999520243704064e-07, + "loss": 0.2466, + "reward": 0.587053582072258, + "reward_std": 0.5590255707502365, + "rewards/accuracy_reward": 0.13169643841683865, + "rewards/format_reward": 0.455357164144516, + "step": 349 + }, + { + "completion_length": 1371.8527526855469, + "epoch": 0.10454783063251437, + "grad_norm": 3.274871826171875, + "kl": 1.2978515625, + "learning_rate": 9.999449260802107e-07, + "loss": 0.2678, + "reward": 0.6875000298023224, + "reward_std": 0.5005280897021294, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.5424107387661934, + "step": 350 + }, + { + "completion_length": 1480.6607666015625, + "epoch": 0.10484653872003584, + "grad_norm": 1.8933440446853638, + "kl": 0.92138671875, + "learning_rate": 9.99937338294035e-07, + "loss": 0.2513, + "reward": 0.6183035895228386, + "reward_std": 0.42989182472229004, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.5022321790456772, + "step": 351 + }, + { + "completion_length": 1482.8973999023438, + "epoch": 0.10514524680755731, + "grad_norm": 2.5377864837646484, + "kl": 0.7236328125, + "learning_rate": 9.999292610201339e-07, + "loss": 0.2599, + "reward": 0.600446455180645, + "reward_std": 0.5438080579042435, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.4732143059372902, + "step": 352 + }, + { + "completion_length": 1460.8416137695312, + "epoch": 0.10544395489507878, + "grad_norm": 1.5391151905059814, + "kl": 0.525390625, + "learning_rate": 9.999206942672944e-07, + "loss": 0.1901, + "reward": 0.6383928954601288, + "reward_std": 0.462944395840168, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.4977678880095482, + "step": 353 + }, + { + "completion_length": 1568.7991638183594, + "epoch": 0.10574266298260025, + "grad_norm": 1.8102600574493408, + "kl": 0.34228515625, + "learning_rate": 9.999116380448367e-07, + "loss": 0.2292, + "reward": 0.6026785969734192, + "reward_std": 0.4785761535167694, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.4062500074505806, + "step": 354 + }, + { + "completion_length": 1647.9531860351562, + "epoch": 0.10604137107012172, + "grad_norm": 1.8610316514968872, + "kl": 0.58349609375, + "learning_rate": 9.999020923626128e-07, + "loss": 0.2469, + "reward": 0.3750000074505806, + "reward_std": 0.4058934301137924, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.3303571492433548, + "step": 355 + }, + { + "completion_length": 1712.5960693359375, + "epoch": 0.1063400791576432, + "grad_norm": 201.33712768554688, + "kl": 3.47265625, + "learning_rate": 9.998920572310075e-07, + "loss": 0.331, + "reward": 0.301339291036129, + "reward_std": 0.3950934484601021, + "rewards/accuracy_reward": 0.06026786123402417, + "rewards/format_reward": 0.2410714402794838, + "step": 356 + }, + { + "completion_length": 1573.3014221191406, + "epoch": 0.10663878724516467, + "grad_norm": 339.5614929199219, + "kl": 5.1474609375, + "learning_rate": 9.998815326609384e-07, + "loss": 0.5132, + "reward": 0.4687500298023224, + "reward_std": 0.47018471360206604, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.4107142984867096, + "step": 357 + }, + { + "completion_length": 1691.0826721191406, + "epoch": 0.10693749533268614, + "grad_norm": 12.394099235534668, + "kl": 0.63671875, + "learning_rate": 9.998705186638546e-07, + "loss": 0.2184, + "reward": 0.3571428805589676, + "reward_std": 0.4453444257378578, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.3102678656578064, + "step": 358 + }, + { + "completion_length": 1709.8661499023438, + "epoch": 0.10723620342020761, + "grad_norm": 2.3911757469177246, + "kl": 0.27880859375, + "learning_rate": 9.998590152517387e-07, + "loss": 0.2123, + "reward": 0.3013393059372902, + "reward_std": 0.44345876574516296, + "rewards/accuracy_reward": 0.04464286030270159, + "rewards/format_reward": 0.2566964365541935, + "step": 359 + }, + { + "completion_length": 1719.8705749511719, + "epoch": 0.10753491150772908, + "grad_norm": 0.779998242855072, + "kl": 0.34228515625, + "learning_rate": 9.99847022437105e-07, + "loss": 0.1505, + "reward": 0.3772321566939354, + "reward_std": 0.3791454806923866, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.2790178693830967, + "step": 360 + }, + { + "completion_length": 1734.8438110351562, + "epoch": 0.10783361959525053, + "grad_norm": 0.859386682510376, + "kl": 0.204345703125, + "learning_rate": 9.998345402330006e-07, + "loss": 0.1675, + "reward": 0.2656250111758709, + "reward_std": 0.39694346487522125, + "rewards/accuracy_reward": 0.03125000209547579, + "rewards/format_reward": 0.2343750037252903, + "step": 361 + }, + { + "completion_length": 1634.4442443847656, + "epoch": 0.108132327682772, + "grad_norm": 1.1880249977111816, + "kl": 0.206298828125, + "learning_rate": 9.998215686530048e-07, + "loss": 0.2012, + "reward": 0.357142873108387, + "reward_std": 0.4755028486251831, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.3035714477300644, + "step": 362 + }, + { + "completion_length": 1658.80810546875, + "epoch": 0.10843103577029348, + "grad_norm": 1.3583589792251587, + "kl": 0.19677734375, + "learning_rate": 9.998081077112299e-07, + "loss": 0.2196, + "reward": 0.4531250298023224, + "reward_std": 0.4156462922692299, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.2968750186264515, + "step": 363 + }, + { + "completion_length": 1719.6719665527344, + "epoch": 0.10872974385781495, + "grad_norm": 1.8113089799880981, + "kl": 0.24755859375, + "learning_rate": 9.997941574223196e-07, + "loss": 0.1974, + "reward": 0.4419643059372902, + "reward_std": 0.38093043863773346, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.2522321492433548, + "step": 364 + }, + { + "completion_length": 1652.0915832519531, + "epoch": 0.10902845194533642, + "grad_norm": 1.4652020931243896, + "kl": 0.236572265625, + "learning_rate": 9.997797178014505e-07, + "loss": 0.2173, + "reward": 0.4508928842842579, + "reward_std": 0.48115528374910355, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.3214285895228386, + "step": 365 + }, + { + "completion_length": 1731.5067443847656, + "epoch": 0.10932716003285789, + "grad_norm": 1.1539406776428223, + "kl": 0.185546875, + "learning_rate": 9.99764788864332e-07, + "loss": 0.1546, + "reward": 0.357142873108387, + "reward_std": 0.4016416594386101, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.2901785895228386, + "step": 366 + }, + { + "completion_length": 1768.16748046875, + "epoch": 0.10962586812037936, + "grad_norm": 1.032090425491333, + "kl": 0.207763671875, + "learning_rate": 9.997493706272045e-07, + "loss": 0.1796, + "reward": 0.2700893022119999, + "reward_std": 0.3954578712582588, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.2232142984867096, + "step": 367 + }, + { + "completion_length": 1695.8327026367188, + "epoch": 0.10992457620790083, + "grad_norm": 1.2777067422866821, + "kl": 0.214599609375, + "learning_rate": 9.997334631068419e-07, + "loss": 0.1965, + "reward": 0.3883928880095482, + "reward_std": 0.5499073937535286, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.3169643022119999, + "step": 368 + }, + { + "completion_length": 1775.6429443359375, + "epoch": 0.1102232842954223, + "grad_norm": 1.5486531257629395, + "kl": 0.235595703125, + "learning_rate": 9.9971706632055e-07, + "loss": 0.1568, + "reward": 0.2879464477300644, + "reward_std": 0.4195859208703041, + "rewards/accuracy_reward": 0.035714287078008056, + "rewards/format_reward": 0.2522321455180645, + "step": 369 + }, + { + "completion_length": 1739.1116943359375, + "epoch": 0.11052199238294377, + "grad_norm": 2.4359030723571777, + "kl": 0.31982421875, + "learning_rate": 9.997001802861675e-07, + "loss": 0.1871, + "reward": 0.464285746216774, + "reward_std": 0.486023411154747, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.330357164144516, + "step": 370 + }, + { + "completion_length": 1648.3415832519531, + "epoch": 0.11082070047046524, + "grad_norm": 3.777467727661133, + "kl": 0.4033203125, + "learning_rate": 9.996828050220636e-07, + "loss": 0.2003, + "reward": 0.4553571492433548, + "reward_std": 0.5065404400229454, + "rewards/accuracy_reward": 0.08258928707800806, + "rewards/format_reward": 0.372767873108387, + "step": 371 + }, + { + "completion_length": 1647.2500915527344, + "epoch": 0.1111194085579867, + "grad_norm": 3.8465616703033447, + "kl": 0.6220703125, + "learning_rate": 9.996649405471418e-07, + "loss": 0.1969, + "reward": 0.450892873108387, + "reward_std": 0.48494482040405273, + "rewards/accuracy_reward": 0.040178573690354824, + "rewards/format_reward": 0.4107143059372902, + "step": 372 + }, + { + "completion_length": 1457.7723999023438, + "epoch": 0.11141811664550817, + "grad_norm": 4.265771865844727, + "kl": 0.9677734375, + "learning_rate": 9.996465868808365e-07, + "loss": 0.2384, + "reward": 0.6718750298023224, + "reward_std": 0.48996546119451523, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.5133928880095482, + "step": 373 + }, + { + "completion_length": 1504.3951721191406, + "epoch": 0.11171682473302964, + "grad_norm": 3.9275591373443604, + "kl": 1.1796875, + "learning_rate": 9.996277440431148e-07, + "loss": 0.2047, + "reward": 0.6808036118745804, + "reward_std": 0.525671012699604, + "rewards/accuracy_reward": 0.09821428847499192, + "rewards/format_reward": 0.5825893059372902, + "step": 374 + }, + { + "completion_length": 1367.5736999511719, + "epoch": 0.11201553282055111, + "grad_norm": 2.4771082401275635, + "kl": 1.71875, + "learning_rate": 9.996084120544758e-07, + "loss": 0.1862, + "reward": 0.8660714626312256, + "reward_std": 0.5358760878443718, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.674107164144516, + "step": 375 + }, + { + "completion_length": 1319.29248046875, + "epoch": 0.11231424090807259, + "grad_norm": 2.725893974304199, + "kl": 1.9453125, + "learning_rate": 9.99588590935951e-07, + "loss": 0.191, + "reward": 0.870535746216774, + "reward_std": 0.49552710354328156, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.7343750298023224, + "step": 376 + }, + { + "completion_length": 1274.3348693847656, + "epoch": 0.11261294899559406, + "grad_norm": 4.392379283905029, + "kl": 2.2734375, + "learning_rate": 9.995682807091034e-07, + "loss": 0.2273, + "reward": 0.8504464775323868, + "reward_std": 0.4022684171795845, + "rewards/accuracy_reward": 0.09821429336443543, + "rewards/format_reward": 0.7522321939468384, + "step": 377 + }, + { + "completion_length": 1221.1339721679688, + "epoch": 0.11291165708311553, + "grad_norm": 3.0209462642669678, + "kl": 1.328125, + "learning_rate": 9.99547481396029e-07, + "loss": 0.1508, + "reward": 0.8861607611179352, + "reward_std": 0.44326214492321014, + "rewards/accuracy_reward": 0.11383929336443543, + "rewards/format_reward": 0.7723214775323868, + "step": 378 + }, + { + "completion_length": 1180.4420166015625, + "epoch": 0.113210365170637, + "grad_norm": 1.4654886722564697, + "kl": 1.1416015625, + "learning_rate": 9.99526193019355e-07, + "loss": 0.1072, + "reward": 0.9508928954601288, + "reward_std": 0.3830752447247505, + "rewards/accuracy_reward": 0.10267857392318547, + "rewards/format_reward": 0.848214328289032, + "step": 379 + }, + { + "completion_length": 1232.4955749511719, + "epoch": 0.11350907325815847, + "grad_norm": 1.1756591796875, + "kl": 1.025390625, + "learning_rate": 9.995044156022416e-07, + "loss": 0.1427, + "reward": 0.926339328289032, + "reward_std": 0.4116244316101074, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.8191964626312256, + "step": 380 + }, + { + "completion_length": 1215.2991638183594, + "epoch": 0.11380778134567994, + "grad_norm": 2.030205726623535, + "kl": 1.0654296875, + "learning_rate": 9.9948214916838e-07, + "loss": 0.1347, + "reward": 1.0156250596046448, + "reward_std": 0.3527984395623207, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.8549107611179352, + "step": 381 + }, + { + "completion_length": 1134.2611999511719, + "epoch": 0.11410648943320141, + "grad_norm": 1.0618712902069092, + "kl": 1.0009765625, + "learning_rate": 9.994593937419942e-07, + "loss": 0.1006, + "reward": 0.9799107611179352, + "reward_std": 0.4288306385278702, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.8459821939468384, + "step": 382 + }, + { + "completion_length": 1233.8192596435547, + "epoch": 0.11440519752072287, + "grad_norm": 0.9445304274559021, + "kl": 0.6767578125, + "learning_rate": 9.994361493478399e-07, + "loss": 0.0916, + "reward": 1.0245536118745804, + "reward_std": 0.432583287358284, + "rewards/accuracy_reward": 0.21428572619333863, + "rewards/format_reward": 0.8102678954601288, + "step": 383 + }, + { + "completion_length": 1278.80810546875, + "epoch": 0.11470390560824434, + "grad_norm": 1.8325169086456299, + "kl": 0.58154296875, + "learning_rate": 9.994124160112044e-07, + "loss": 0.1391, + "reward": 1.0401785969734192, + "reward_std": 0.42476417124271393, + "rewards/accuracy_reward": 0.2276785832364112, + "rewards/format_reward": 0.8125000298023224, + "step": 384 + }, + { + "completion_length": 1192.3572387695312, + "epoch": 0.11500261369576581, + "grad_norm": 0.5590088963508606, + "kl": 0.5712890625, + "learning_rate": 9.993881937579075e-07, + "loss": 0.103, + "reward": 1.0022321939468384, + "reward_std": 0.40944279730319977, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.8370536118745804, + "step": 385 + }, + { + "completion_length": 1344.2098693847656, + "epoch": 0.11530132178328728, + "grad_norm": 0.9115962982177734, + "kl": 0.57421875, + "learning_rate": 9.993634826143003e-07, + "loss": 0.0832, + "reward": 1.0178571939468384, + "reward_std": 0.3950372263789177, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.8415178954601288, + "step": 386 + }, + { + "completion_length": 1241.9107360839844, + "epoch": 0.11560002987080875, + "grad_norm": 0.6985265612602234, + "kl": 0.7138671875, + "learning_rate": 9.993382826072668e-07, + "loss": 0.111, + "reward": 1.0424107611179352, + "reward_std": 0.40411342680454254, + "rewards/accuracy_reward": 0.18973215017467737, + "rewards/format_reward": 0.8526786118745804, + "step": 387 + }, + { + "completion_length": 1190.19873046875, + "epoch": 0.11589873795833022, + "grad_norm": 0.8349444270133972, + "kl": 0.6435546875, + "learning_rate": 9.993125937642214e-07, + "loss": 0.0581, + "reward": 1.0401786118745804, + "reward_std": 0.34949304908514023, + "rewards/accuracy_reward": 0.1495535816065967, + "rewards/format_reward": 0.8906250298023224, + "step": 388 + }, + { + "completion_length": 1173.02685546875, + "epoch": 0.1161974460458517, + "grad_norm": 1.2173638343811035, + "kl": 0.673828125, + "learning_rate": 9.992864161131115e-07, + "loss": 0.104, + "reward": 1.1026785969734192, + "reward_std": 0.34115442633628845, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.8883928954601288, + "step": 389 + }, + { + "completion_length": 1273.6295166015625, + "epoch": 0.11649615413337316, + "grad_norm": 1.3414669036865234, + "kl": 0.7822265625, + "learning_rate": 9.992597496824156e-07, + "loss": 0.0689, + "reward": 1.0848214775323868, + "reward_std": 0.3763594925403595, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.870535746216774, + "step": 390 + }, + { + "completion_length": 1150.8772583007812, + "epoch": 0.11679486222089464, + "grad_norm": 3.708460807800293, + "kl": 1.0185546875, + "learning_rate": 9.992325945011443e-07, + "loss": 0.0689, + "reward": 0.9776786267757416, + "reward_std": 0.3158075660467148, + "rewards/accuracy_reward": 0.08482143515720963, + "rewards/format_reward": 0.8928571790456772, + "step": 391 + }, + { + "completion_length": 1107.3594360351562, + "epoch": 0.1170935703084161, + "grad_norm": 2.1011033058166504, + "kl": 0.8408203125, + "learning_rate": 9.992049505988397e-07, + "loss": 0.0079, + "reward": 1.0781250298023224, + "reward_std": 0.2763647027313709, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9084822088479996, + "step": 392 + }, + { + "completion_length": 1268.2969360351562, + "epoch": 0.11739227839593756, + "grad_norm": 1.6898927688598633, + "kl": 0.8037109375, + "learning_rate": 9.991768180055755e-07, + "loss": 0.0776, + "reward": 0.941964328289032, + "reward_std": 0.2882690206170082, + "rewards/accuracy_reward": 0.03571428591385484, + "rewards/format_reward": 0.9062500447034836, + "step": 393 + }, + { + "completion_length": 1190.5000610351562, + "epoch": 0.11769098648345903, + "grad_norm": 1.8050318956375122, + "kl": 0.6181640625, + "learning_rate": 9.991481967519575e-07, + "loss": 0.0778, + "reward": 1.1093750596046448, + "reward_std": 0.3659384623169899, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9084821939468384, + "step": 394 + }, + { + "completion_length": 1276.9040832519531, + "epoch": 0.1179896945709805, + "grad_norm": 0.8927368521690369, + "kl": 0.6767578125, + "learning_rate": 9.991190868691228e-07, + "loss": 0.071, + "reward": 1.0156250596046448, + "reward_std": 0.2639070972800255, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.9375000447034836, + "step": 395 + }, + { + "completion_length": 1201.4263610839844, + "epoch": 0.11828840265850198, + "grad_norm": 1.0676677227020264, + "kl": 0.7587890625, + "learning_rate": 9.990894883887397e-07, + "loss": 0.0292, + "reward": 1.0491071939468384, + "reward_std": 0.30908485502004623, + "rewards/accuracy_reward": 0.15401786309666932, + "rewards/format_reward": 0.895089328289032, + "step": 396 + }, + { + "completion_length": 1126.6719055175781, + "epoch": 0.11858711074602345, + "grad_norm": 1.3472141027450562, + "kl": 0.6513671875, + "learning_rate": 9.99059401343009e-07, + "loss": 0.0868, + "reward": 1.1607143580913544, + "reward_std": 0.27921096235513687, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.9553571790456772, + "step": 397 + }, + { + "completion_length": 1147.6629943847656, + "epoch": 0.11888581883354492, + "grad_norm": 1.387945294380188, + "kl": 0.63671875, + "learning_rate": 9.990288257646621e-07, + "loss": 0.1024, + "reward": 1.1205357611179352, + "reward_std": 0.3425816744565964, + "rewards/accuracy_reward": 0.20089287124574184, + "rewards/format_reward": 0.91964291036129, + "step": 398 + }, + { + "completion_length": 1177.2634582519531, + "epoch": 0.11918452692106639, + "grad_norm": 0.6497628688812256, + "kl": 0.52099609375, + "learning_rate": 9.989977616869623e-07, + "loss": 0.0526, + "reward": 1.0156250447034836, + "reward_std": 0.28865882381796837, + "rewards/accuracy_reward": 0.09821429196745157, + "rewards/format_reward": 0.9174107611179352, + "step": 399 + }, + { + "completion_length": 1124.8214721679688, + "epoch": 0.11948323500858786, + "grad_norm": 0.46972426772117615, + "kl": 0.455078125, + "learning_rate": 9.989662091437042e-07, + "loss": 0.0011, + "reward": 1.0089286416769028, + "reward_std": 0.2901039719581604, + "rewards/accuracy_reward": 0.10937500675208867, + "rewards/format_reward": 0.8995535969734192, + "step": 400 + }, + { + "completion_length": 1193.8013916015625, + "epoch": 0.11978194309610933, + "grad_norm": 1.0045520067214966, + "kl": 0.48046875, + "learning_rate": 9.989341681692143e-07, + "loss": 0.0399, + "reward": 1.0781250298023224, + "reward_std": 0.2902345508337021, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.9218750298023224, + "step": 401 + }, + { + "completion_length": 1125.60498046875, + "epoch": 0.1200806511836308, + "grad_norm": 1.1198220252990723, + "kl": 0.45361328125, + "learning_rate": 9.989016387983494e-07, + "loss": 0.034, + "reward": 1.0424107611179352, + "reward_std": 0.23471219465136528, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.926339328289032, + "step": 402 + }, + { + "completion_length": 1139.685302734375, + "epoch": 0.12037935927115227, + "grad_norm": 1.1138721704483032, + "kl": 0.5, + "learning_rate": 9.988686210664985e-07, + "loss": -0.0066, + "reward": 1.0133928805589676, + "reward_std": 0.25333142653107643, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.9397321939468384, + "step": 403 + }, + { + "completion_length": 1089.8527374267578, + "epoch": 0.12067806735867373, + "grad_norm": 0.9113104343414307, + "kl": 0.64453125, + "learning_rate": 9.98835115009582e-07, + "loss": 0.0951, + "reward": 1.0156250596046448, + "reward_std": 0.338597796857357, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.9040178954601288, + "step": 404 + }, + { + "completion_length": 1146.2411193847656, + "epoch": 0.1209767754461952, + "grad_norm": 1.748014211654663, + "kl": 0.9248046875, + "learning_rate": 9.988011206640509e-07, + "loss": 0.1264, + "reward": 0.9732143133878708, + "reward_std": 0.31099607050418854, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.8861607611179352, + "step": 405 + }, + { + "completion_length": 1037.2277069091797, + "epoch": 0.12127548353371667, + "grad_norm": 1.235247015953064, + "kl": 0.58740234375, + "learning_rate": 9.987666380668876e-07, + "loss": 0.0725, + "reward": 1.0691964626312256, + "reward_std": 0.26178086176514626, + "rewards/accuracy_reward": 0.13839286542497575, + "rewards/format_reward": 0.9308036118745804, + "step": 406 + }, + { + "completion_length": 1133.3839874267578, + "epoch": 0.12157419162123814, + "grad_norm": 1.7638602256774902, + "kl": 0.64453125, + "learning_rate": 9.98731667255606e-07, + "loss": 0.1041, + "reward": 1.0580357760190964, + "reward_std": 0.32205165922641754, + "rewards/accuracy_reward": 0.16071429592557251, + "rewards/format_reward": 0.8973214626312256, + "step": 407 + }, + { + "completion_length": 1133.0781860351562, + "epoch": 0.12187289970875961, + "grad_norm": 3.8373935222625732, + "kl": 0.82080078125, + "learning_rate": 9.98696208268251e-07, + "loss": 0.1399, + "reward": 1.0379464775323868, + "reward_std": 0.40354814380407333, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.8683036118745804, + "step": 408 + }, + { + "completion_length": 1101.8438110351562, + "epoch": 0.12217160779628108, + "grad_norm": 2.536115884780884, + "kl": 0.634765625, + "learning_rate": 9.986602611433982e-07, + "loss": 0.0948, + "reward": 0.9687500596046448, + "reward_std": 0.26515548676252365, + "rewards/accuracy_reward": 0.05803571571595967, + "rewards/format_reward": 0.910714328289032, + "step": 409 + }, + { + "completion_length": 1148.6518249511719, + "epoch": 0.12247031588380256, + "grad_norm": 3.7508976459503174, + "kl": 0.7470703125, + "learning_rate": 9.986238259201547e-07, + "loss": 0.0527, + "reward": 0.9910714626312256, + "reward_std": 0.32227814197540283, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.8995536118745804, + "step": 410 + }, + { + "completion_length": 1049.5357513427734, + "epoch": 0.12276902397132403, + "grad_norm": 9.27310848236084, + "kl": 0.88916015625, + "learning_rate": 9.985869026381586e-07, + "loss": 0.1226, + "reward": 0.9441964775323868, + "reward_std": 0.31378623843193054, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.9084821939468384, + "step": 411 + }, + { + "completion_length": 953.7924499511719, + "epoch": 0.1230677320588455, + "grad_norm": 5.268216609954834, + "kl": 0.46337890625, + "learning_rate": 9.985494913375785e-07, + "loss": 0.0889, + "reward": 1.0312500298023224, + "reward_std": 0.33468465134501457, + "rewards/accuracy_reward": 0.15178572502918541, + "rewards/format_reward": 0.8794643133878708, + "step": 412 + }, + { + "completion_length": 1068.5781860351562, + "epoch": 0.12336644014636697, + "grad_norm": 6.535355091094971, + "kl": 0.4951171875, + "learning_rate": 9.985115920591146e-07, + "loss": 0.1003, + "reward": 0.9955357611179352, + "reward_std": 0.3469388708472252, + "rewards/accuracy_reward": 0.1004464344587177, + "rewards/format_reward": 0.8950893431901932, + "step": 413 + }, + { + "completion_length": 987.0447082519531, + "epoch": 0.12366514823388844, + "grad_norm": 3.8134427070617676, + "kl": 0.5234375, + "learning_rate": 9.984732048439972e-07, + "loss": 0.0567, + "reward": 1.0915179252624512, + "reward_std": 0.37748579680919647, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.886160746216774, + "step": 414 + }, + { + "completion_length": 942.1295166015625, + "epoch": 0.1239638563214099, + "grad_norm": 2.5719666481018066, + "kl": 0.40380859375, + "learning_rate": 9.984343297339883e-07, + "loss": 0.0453, + "reward": 1.1517857760190964, + "reward_std": 0.18547306396067142, + "rewards/accuracy_reward": 0.20312500977888703, + "rewards/format_reward": 0.9486607611179352, + "step": 415 + }, + { + "completion_length": 1024.2790832519531, + "epoch": 0.12426256440893137, + "grad_norm": 1.5626018047332764, + "kl": 0.56591796875, + "learning_rate": 9.983949667713796e-07, + "loss": 0.0719, + "reward": 0.988839328289032, + "reward_std": 0.3057236075401306, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.8861607611179352, + "step": 416 + }, + { + "completion_length": 1019.9107360839844, + "epoch": 0.12456127249645284, + "grad_norm": 2.170276403427124, + "kl": 0.4453125, + "learning_rate": 9.983551159989946e-07, + "loss": 0.0222, + "reward": 1.0513393431901932, + "reward_std": 0.22048842906951904, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9375000447034836, + "step": 417 + }, + { + "completion_length": 1009.83935546875, + "epoch": 0.12485998058397431, + "grad_norm": 13.768925666809082, + "kl": 1.06640625, + "learning_rate": 9.98314777460187e-07, + "loss": 0.1505, + "reward": 1.006696492433548, + "reward_std": 0.3680913746356964, + "rewards/accuracy_reward": 0.1294642947614193, + "rewards/format_reward": 0.8772321939468384, + "step": 418 + }, + { + "completion_length": 1013.4777221679688, + "epoch": 0.1251586886714958, + "grad_norm": 3.9573822021484375, + "kl": 0.41748046875, + "learning_rate": 9.98273951198841e-07, + "loss": 0.0539, + "reward": 1.0379464775323868, + "reward_std": 0.2970322445034981, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.9352678954601288, + "step": 419 + }, + { + "completion_length": 1070.9866485595703, + "epoch": 0.12545739675901724, + "grad_norm": 3.56783127784729, + "kl": 0.46630859375, + "learning_rate": 9.982326372593718e-07, + "loss": 0.0416, + "reward": 1.0892857760190964, + "reward_std": 0.2734467573463917, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9241071939468384, + "step": 420 + }, + { + "completion_length": 1020.2009429931641, + "epoch": 0.1257561048465387, + "grad_norm": 1.6686723232269287, + "kl": 0.5068359375, + "learning_rate": 9.981908356867247e-07, + "loss": 0.0957, + "reward": 1.069196492433548, + "reward_std": 0.38053061068058014, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9196428954601288, + "step": 421 + }, + { + "completion_length": 965.6562957763672, + "epoch": 0.12605481293406018, + "grad_norm": 2.723619222640991, + "kl": 0.478515625, + "learning_rate": 9.981485465263759e-07, + "loss": 0.0744, + "reward": 1.1138393580913544, + "reward_std": 0.2907983995974064, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.948660746216774, + "step": 422 + }, + { + "completion_length": 1019.0223541259766, + "epoch": 0.12635352102158165, + "grad_norm": 5.177270412445068, + "kl": 0.873046875, + "learning_rate": 9.981057698243315e-07, + "loss": 0.1259, + "reward": 1.0379464626312256, + "reward_std": 0.29117198288440704, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9196428954601288, + "step": 423 + }, + { + "completion_length": 1013.9933471679688, + "epoch": 0.12665222910910312, + "grad_norm": 12.171834945678711, + "kl": 0.8232421875, + "learning_rate": 9.980625056271289e-07, + "loss": 0.109, + "reward": 1.053571492433548, + "reward_std": 0.2816132754087448, + "rewards/accuracy_reward": 0.13616071757860482, + "rewards/format_reward": 0.9174107611179352, + "step": 424 + }, + { + "completion_length": 1023.3169860839844, + "epoch": 0.1269509371966246, + "grad_norm": 5.744314193725586, + "kl": 0.9228515625, + "learning_rate": 9.980187539818348e-07, + "loss": 0.1025, + "reward": 1.0022321939468384, + "reward_std": 0.2610950842499733, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.9151786118745804, + "step": 425 + }, + { + "completion_length": 1115.6942596435547, + "epoch": 0.12724964528414606, + "grad_norm": 4.625571250915527, + "kl": 0.6240234375, + "learning_rate": 9.979745149360471e-07, + "loss": 0.0867, + "reward": 1.0669643431901932, + "reward_std": 0.331573061645031, + "rewards/accuracy_reward": 0.18303572316654027, + "rewards/format_reward": 0.8839286118745804, + "step": 426 + }, + { + "completion_length": 1150.3906860351562, + "epoch": 0.12754835337166753, + "grad_norm": 2.0698277950286865, + "kl": 0.3486328125, + "learning_rate": 9.97929788537893e-07, + "loss": 0.0289, + "reward": 1.0580357611179352, + "reward_std": 0.31512513384222984, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.917410746216774, + "step": 427 + }, + { + "completion_length": 1064.2098693847656, + "epoch": 0.127847061459189, + "grad_norm": 4.49570369720459, + "kl": 0.29736328125, + "learning_rate": 9.978845748360312e-07, + "loss": 0.0677, + "reward": 1.0625000596046448, + "reward_std": 0.23731505870819092, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.941964328289032, + "step": 428 + }, + { + "completion_length": 1115.529067993164, + "epoch": 0.12814576954671048, + "grad_norm": 1.6804543733596802, + "kl": 0.287841796875, + "learning_rate": 9.978388738796493e-07, + "loss": 0.0877, + "reward": 0.9821428954601288, + "reward_std": 0.33101607114076614, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.8995536118745804, + "step": 429 + }, + { + "completion_length": 1137.7054138183594, + "epoch": 0.12844447763423195, + "grad_norm": 2.1557087898254395, + "kl": 0.52734375, + "learning_rate": 9.977926857184655e-07, + "loss": 0.1056, + "reward": 0.986607164144516, + "reward_std": 0.33993392437696457, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.8906250447034836, + "step": 430 + }, + { + "completion_length": 1056.1139068603516, + "epoch": 0.12874318572175342, + "grad_norm": 1.0246883630752563, + "kl": 0.46484375, + "learning_rate": 9.977460104027282e-07, + "loss": 0.0913, + "reward": 1.0691964626312256, + "reward_std": 0.27455997094511986, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.9151786118745804, + "step": 431 + }, + { + "completion_length": 1195.138442993164, + "epoch": 0.1290418938092749, + "grad_norm": 1.4098161458969116, + "kl": 0.73388671875, + "learning_rate": 9.97698847983215e-07, + "loss": 0.0897, + "reward": 1.0625000298023224, + "reward_std": 0.29467254132032394, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.9084821790456772, + "step": 432 + }, + { + "completion_length": 1112.5625610351562, + "epoch": 0.12934060189679636, + "grad_norm": 3.756798028945923, + "kl": 0.7880859375, + "learning_rate": 9.976511985112348e-07, + "loss": 0.0677, + "reward": 0.9821428954601288, + "reward_std": 0.26684773806482553, + "rewards/accuracy_reward": 0.05803571501746774, + "rewards/format_reward": 0.9241071790456772, + "step": 433 + }, + { + "completion_length": 1089.2411193847656, + "epoch": 0.12963930998431783, + "grad_norm": 4.463489055633545, + "kl": 0.779296875, + "learning_rate": 9.97603062038625e-07, + "loss": 0.0743, + "reward": 1.0602679252624512, + "reward_std": 0.30329176411032677, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.933035746216774, + "step": 434 + }, + { + "completion_length": 1138.0179138183594, + "epoch": 0.1299380180718393, + "grad_norm": 4.0527262687683105, + "kl": 1.2099609375, + "learning_rate": 9.975544386177537e-07, + "loss": 0.1098, + "reward": 1.131696492433548, + "reward_std": 0.3119664043188095, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.910714328289032, + "step": 435 + }, + { + "completion_length": 1174.5290832519531, + "epoch": 0.13023672615936077, + "grad_norm": 3.8842341899871826, + "kl": 0.97021484375, + "learning_rate": 9.97505328301518e-07, + "loss": 0.0807, + "reward": 0.9754464775323868, + "reward_std": 0.20816821977496147, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.9218750596046448, + "step": 436 + }, + { + "completion_length": 1144.5379638671875, + "epoch": 0.13053543424688224, + "grad_norm": 6.443349838256836, + "kl": 0.70263671875, + "learning_rate": 9.974557311433453e-07, + "loss": 0.1146, + "reward": 0.9464285969734192, + "reward_std": 0.2888457216322422, + "rewards/accuracy_reward": 0.040178571827709675, + "rewards/format_reward": 0.9062500298023224, + "step": 437 + }, + { + "completion_length": 1203.4911499023438, + "epoch": 0.1308341423344037, + "grad_norm": 3.5392961502075195, + "kl": 0.580078125, + "learning_rate": 9.974056471971925e-07, + "loss": 0.1008, + "reward": 1.0915178954601288, + "reward_std": 0.297273151576519, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.91964291036129, + "step": 438 + }, + { + "completion_length": 1090.1585540771484, + "epoch": 0.13113285042192518, + "grad_norm": 4.159813404083252, + "kl": 0.5048828125, + "learning_rate": 9.973550765175463e-07, + "loss": 0.0692, + "reward": 0.9821428805589676, + "reward_std": 0.33540207147598267, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.8906250447034836, + "step": 439 + }, + { + "completion_length": 1066.0000610351562, + "epoch": 0.13143155850944666, + "grad_norm": 2.104637622833252, + "kl": 0.501953125, + "learning_rate": 9.97304019159422e-07, + "loss": 0.084, + "reward": 1.0892857611179352, + "reward_std": 0.29682885482907295, + "rewards/accuracy_reward": 0.18080358300358057, + "rewards/format_reward": 0.9084821790456772, + "step": 440 + }, + { + "completion_length": 1136.6630249023438, + "epoch": 0.13173026659696813, + "grad_norm": 1.6716245412826538, + "kl": 0.7314453125, + "learning_rate": 9.972524751783657e-07, + "loss": 0.0734, + "reward": 1.020089328289032, + "reward_std": 0.2412293255329132, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.93526791036129, + "step": 441 + }, + { + "completion_length": 1116.5469055175781, + "epoch": 0.13202897468448957, + "grad_norm": 1.6085306406021118, + "kl": 0.63232421875, + "learning_rate": 9.972004446304516e-07, + "loss": 0.0573, + "reward": 1.020089328289032, + "reward_std": 0.24245916679501534, + "rewards/accuracy_reward": 0.08928571990691125, + "rewards/format_reward": 0.9308036118745804, + "step": 442 + }, + { + "completion_length": 1080.8795166015625, + "epoch": 0.13232768277201104, + "grad_norm": 3.6749026775360107, + "kl": 0.9921875, + "learning_rate": 9.971479275722843e-07, + "loss": 0.1199, + "reward": 1.0558036118745804, + "reward_std": 0.20891683734953403, + "rewards/accuracy_reward": 0.10937500093132257, + "rewards/format_reward": 0.9464286118745804, + "step": 443 + }, + { + "completion_length": 1024.0335388183594, + "epoch": 0.1326263908595325, + "grad_norm": 2.2253153324127197, + "kl": 1.1376953125, + "learning_rate": 9.97094924060997e-07, + "loss": 0.0716, + "reward": 1.0133928954601288, + "reward_std": 0.2569349594414234, + "rewards/accuracy_reward": 0.08258928684517741, + "rewards/format_reward": 0.9308036118745804, + "step": 444 + }, + { + "completion_length": 1061.935317993164, + "epoch": 0.13292509894705398, + "grad_norm": 7.922972679138184, + "kl": 1.283203125, + "learning_rate": 9.970414341542522e-07, + "loss": 0.1188, + "reward": 1.113839328289032, + "reward_std": 0.26120675168931484, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9241071790456772, + "step": 445 + }, + { + "completion_length": 1084.1005096435547, + "epoch": 0.13322380703457545, + "grad_norm": 2.286102056503296, + "kl": 0.8876953125, + "learning_rate": 9.969874579102418e-07, + "loss": 0.0742, + "reward": 1.064732164144516, + "reward_std": 0.2731858268380165, + "rewards/accuracy_reward": 0.13169643585570157, + "rewards/format_reward": 0.933035746216774, + "step": 446 + }, + { + "completion_length": 1042.3125457763672, + "epoch": 0.13352251512209692, + "grad_norm": 1.6019304990768433, + "kl": 0.9423828125, + "learning_rate": 9.969329953876866e-07, + "loss": 0.0706, + "reward": 1.1272321939468384, + "reward_std": 0.25584539771080017, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9308036118745804, + "step": 447 + }, + { + "completion_length": 1057.341567993164, + "epoch": 0.1338212232096184, + "grad_norm": 9.805750846862793, + "kl": 1.2431640625, + "learning_rate": 9.968780466458367e-07, + "loss": 0.0681, + "reward": 1.0245536118745804, + "reward_std": 0.2694372795522213, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9352678954601288, + "step": 448 + }, + { + "completion_length": 1128.0179443359375, + "epoch": 0.13411993129713987, + "grad_norm": 4.188530921936035, + "kl": 0.8876953125, + "learning_rate": 9.968226117444707e-07, + "loss": 0.0795, + "reward": 1.0647322088479996, + "reward_std": 0.2691515237092972, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.9285714626312256, + "step": 449 + }, + { + "completion_length": 1142.2255249023438, + "epoch": 0.13441863938466134, + "grad_norm": 1.334722638130188, + "kl": 0.7236328125, + "learning_rate": 9.967666907438965e-07, + "loss": 0.0805, + "reward": 1.0156250447034836, + "reward_std": 0.2733675055205822, + "rewards/accuracy_reward": 0.10491071618162096, + "rewards/format_reward": 0.9107143431901932, + "step": 450 + }, + { + "completion_length": 1187.2232666015625, + "epoch": 0.1347173474721828, + "grad_norm": 4.0897650718688965, + "kl": 1.3408203125, + "learning_rate": 9.967102837049506e-07, + "loss": 0.1545, + "reward": 0.9821429252624512, + "reward_std": 0.3736622706055641, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.863839328289032, + "step": 451 + }, + { + "completion_length": 1127.1830749511719, + "epoch": 0.13501605555970428, + "grad_norm": 2.2374653816223145, + "kl": 0.8525390625, + "learning_rate": 9.966533906889987e-07, + "loss": 0.0852, + "reward": 1.035714328289032, + "reward_std": 0.31173761561512947, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.9151786118745804, + "step": 452 + }, + { + "completion_length": 1198.2254943847656, + "epoch": 0.13531476364722575, + "grad_norm": 3.7077972888946533, + "kl": 1.0556640625, + "learning_rate": 9.965960117579341e-07, + "loss": 0.1799, + "reward": 1.0089286267757416, + "reward_std": 0.30419784784317017, + "rewards/accuracy_reward": 0.13839286426082253, + "rewards/format_reward": 0.8705357611179352, + "step": 453 + }, + { + "completion_length": 1130.4732971191406, + "epoch": 0.13561347173474722, + "grad_norm": 2.1643166542053223, + "kl": 1.0302734375, + "learning_rate": 9.965381469741798e-07, + "loss": 0.1258, + "reward": 0.9776786416769028, + "reward_std": 0.3497687056660652, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.8549107611179352, + "step": 454 + }, + { + "completion_length": 1195.2500305175781, + "epoch": 0.1359121798222687, + "grad_norm": 4.452880382537842, + "kl": 0.8955078125, + "learning_rate": 9.964797964006871e-07, + "loss": 0.1252, + "reward": 1.0937500298023224, + "reward_std": 0.346342496573925, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.879464328289032, + "step": 455 + }, + { + "completion_length": 1163.4420166015625, + "epoch": 0.13621088790979016, + "grad_norm": 1.6721220016479492, + "kl": 1.15625, + "learning_rate": 9.964209601009357e-07, + "loss": 0.1796, + "reward": 0.9754464775323868, + "reward_std": 0.3987119719386101, + "rewards/accuracy_reward": 0.11383929010480642, + "rewards/format_reward": 0.8616071790456772, + "step": 456 + }, + { + "completion_length": 1209.776840209961, + "epoch": 0.13650959599731163, + "grad_norm": 1.788064956665039, + "kl": 1.68359375, + "learning_rate": 9.963616381389336e-07, + "loss": 0.2072, + "reward": 0.9218750447034836, + "reward_std": 0.41077765077352524, + "rewards/accuracy_reward": 0.08482143003493547, + "rewards/format_reward": 0.8370535969734192, + "step": 457 + }, + { + "completion_length": 1150.90185546875, + "epoch": 0.1368083040848331, + "grad_norm": 1.7725927829742432, + "kl": 1.0654296875, + "learning_rate": 9.963018305792174e-07, + "loss": 0.1405, + "reward": 1.129464328289032, + "reward_std": 0.34189096093177795, + "rewards/accuracy_reward": 0.2455357201397419, + "rewards/format_reward": 0.8839285969734192, + "step": 458 + }, + { + "completion_length": 1252.1808471679688, + "epoch": 0.13710701217235458, + "grad_norm": 1.3212521076202393, + "kl": 1.267578125, + "learning_rate": 9.962415374868516e-07, + "loss": 0.1395, + "reward": 0.879464328289032, + "reward_std": 0.3803185224533081, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.8325893133878708, + "step": 459 + }, + { + "completion_length": 1244.6272583007812, + "epoch": 0.13740572025987605, + "grad_norm": 2.7945077419281006, + "kl": 0.9443359375, + "learning_rate": 9.961807589274297e-07, + "loss": 0.1208, + "reward": 1.0580357611179352, + "reward_std": 0.3474365696310997, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.8973214775323868, + "step": 460 + }, + { + "completion_length": 1241.7924499511719, + "epoch": 0.13770442834739752, + "grad_norm": 2.378653049468994, + "kl": 1.3681640625, + "learning_rate": 9.961194949670722e-07, + "loss": 0.1762, + "reward": 1.1696429252624512, + "reward_std": 0.35622213780879974, + "rewards/accuracy_reward": 0.2946428693830967, + "rewards/format_reward": 0.8750000298023224, + "step": 461 + }, + { + "completion_length": 1264.9888916015625, + "epoch": 0.138003136434919, + "grad_norm": 5.105492115020752, + "kl": 2.271484375, + "learning_rate": 9.960577456724288e-07, + "loss": 0.1964, + "reward": 0.9285714775323868, + "reward_std": 0.39077892899513245, + "rewards/accuracy_reward": 0.11383929383009672, + "rewards/format_reward": 0.8147321790456772, + "step": 462 + }, + { + "completion_length": 1239.7009582519531, + "epoch": 0.13830184452244043, + "grad_norm": 1.1566580533981323, + "kl": 1.27734375, + "learning_rate": 9.959955111106763e-07, + "loss": 0.1824, + "reward": 1.0000000298023224, + "reward_std": 0.37856920063495636, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.8415178954601288, + "step": 463 + }, + { + "completion_length": 1210.997802734375, + "epoch": 0.1386005526099619, + "grad_norm": 2.9241909980773926, + "kl": 0.7490234375, + "learning_rate": 9.959327913495202e-07, + "loss": 0.1515, + "reward": 1.1138393133878708, + "reward_std": 0.37857917696237564, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.8973214775323868, + "step": 464 + }, + { + "completion_length": 1259.5915832519531, + "epoch": 0.13889926069748337, + "grad_norm": 1.2637001276016235, + "kl": 0.68505859375, + "learning_rate": 9.95869586457193e-07, + "loss": 0.0778, + "reward": 1.0915178954601288, + "reward_std": 0.39872801303863525, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.8683035969734192, + "step": 465 + }, + { + "completion_length": 1256.185302734375, + "epoch": 0.13919796878500484, + "grad_norm": 1.082383155822754, + "kl": 0.8212890625, + "learning_rate": 9.958058965024558e-07, + "loss": 0.1043, + "reward": 0.973214328289032, + "reward_std": 0.24244262278079987, + "rewards/accuracy_reward": 0.08035714295692742, + "rewards/format_reward": 0.8928571790456772, + "step": 466 + }, + { + "completion_length": 1330.0245971679688, + "epoch": 0.13949667687252632, + "grad_norm": 1.359284520149231, + "kl": 0.923828125, + "learning_rate": 9.957417215545968e-07, + "loss": 0.0776, + "reward": 1.0066964477300644, + "reward_std": 0.3291664123535156, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.877232164144516, + "step": 467 + }, + { + "completion_length": 1345.8348999023438, + "epoch": 0.13979538496004779, + "grad_norm": 1.1364712715148926, + "kl": 0.900390625, + "learning_rate": 9.95677061683432e-07, + "loss": 0.0883, + "reward": 0.944196492433548, + "reward_std": 0.3156161457300186, + "rewards/accuracy_reward": 0.060267859138548374, + "rewards/format_reward": 0.8839286267757416, + "step": 468 + }, + { + "completion_length": 1365.3772888183594, + "epoch": 0.14009409304756926, + "grad_norm": 1.3673044443130493, + "kl": 0.67578125, + "learning_rate": 9.956119169593055e-07, + "loss": 0.0636, + "reward": 0.9754464626312256, + "reward_std": 0.313346229493618, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.901785746216774, + "step": 469 + }, + { + "completion_length": 1338.0536193847656, + "epoch": 0.14039280113509073, + "grad_norm": 2.9297776222229004, + "kl": 0.69140625, + "learning_rate": 9.955462874530878e-07, + "loss": 0.0507, + "reward": 1.0424107313156128, + "reward_std": 0.27952076494693756, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.9330357611179352, + "step": 470 + }, + { + "completion_length": 1264.2679138183594, + "epoch": 0.1406915092226122, + "grad_norm": 1.4710874557495117, + "kl": 0.69970703125, + "learning_rate": 9.954801732361776e-07, + "loss": 0.0488, + "reward": 1.022321492433548, + "reward_std": 0.2908954955637455, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9196428954601288, + "step": 471 + }, + { + "completion_length": 1288.6473693847656, + "epoch": 0.14099021731013367, + "grad_norm": 1.8629940748214722, + "kl": 0.7138671875, + "learning_rate": 9.954135743805007e-07, + "loss": 0.0691, + "reward": 1.1250000596046448, + "reward_std": 0.33989452198147774, + "rewards/accuracy_reward": 0.22321429662406445, + "rewards/format_reward": 0.9017857611179352, + "step": 472 + }, + { + "completion_length": 1387.1005249023438, + "epoch": 0.14128892539765514, + "grad_norm": 4.959175109863281, + "kl": 0.83984375, + "learning_rate": 9.9534649095851e-07, + "loss": 0.1305, + "reward": 0.9464286118745804, + "reward_std": 0.3590996041893959, + "rewards/accuracy_reward": 0.07142857508733869, + "rewards/format_reward": 0.8750000447034836, + "step": 473 + }, + { + "completion_length": 1332.1295166015625, + "epoch": 0.1415876334851766, + "grad_norm": 4.626978874206543, + "kl": 0.78125, + "learning_rate": 9.952789230431859e-07, + "loss": 0.0919, + "reward": 1.0111607760190964, + "reward_std": 0.30090444907546043, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.897321492433548, + "step": 474 + }, + { + "completion_length": 1372.3817443847656, + "epoch": 0.14188634157269808, + "grad_norm": 3.2861478328704834, + "kl": 0.7626953125, + "learning_rate": 9.952108707080355e-07, + "loss": 0.0809, + "reward": 1.0000000596046448, + "reward_std": 0.322806216776371, + "rewards/accuracy_reward": 0.08482143189758062, + "rewards/format_reward": 0.9151786118745804, + "step": 475 + }, + { + "completion_length": 1376.0491943359375, + "epoch": 0.14218504966021955, + "grad_norm": 4.0432939529418945, + "kl": 1.0478515625, + "learning_rate": 9.95142334027093e-07, + "loss": 0.0743, + "reward": 0.9754464626312256, + "reward_std": 0.3009757846593857, + "rewards/accuracy_reward": 0.058035718742758036, + "rewards/format_reward": 0.9174107611179352, + "step": 476 + }, + { + "completion_length": 1393.8192443847656, + "epoch": 0.14248375774774102, + "grad_norm": 1.9019819498062134, + "kl": 1.169921875, + "learning_rate": 9.950733130749197e-07, + "loss": 0.1224, + "reward": 1.1116071939468384, + "reward_std": 0.28845223784446716, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9174107611179352, + "step": 477 + }, + { + "completion_length": 1327.63623046875, + "epoch": 0.1427824658352625, + "grad_norm": 8.266942977905273, + "kl": 1.20849609375, + "learning_rate": 9.950038079266038e-07, + "loss": 0.086, + "reward": 1.0915178954601288, + "reward_std": 0.25438660755753517, + "rewards/accuracy_reward": 0.16071429336443543, + "rewards/format_reward": 0.9308036118745804, + "step": 478 + }, + { + "completion_length": 1368.6451416015625, + "epoch": 0.14308117392278397, + "grad_norm": 11.748795509338379, + "kl": 1.970703125, + "learning_rate": 9.949338186577601e-07, + "loss": 0.1337, + "reward": 0.9196428954601288, + "reward_std": 0.2800055369734764, + "rewards/accuracy_reward": 0.0334821455180645, + "rewards/format_reward": 0.8861607611179352, + "step": 479 + }, + { + "completion_length": 1340.1518249511719, + "epoch": 0.14337988201030544, + "grad_norm": 3.5724785327911377, + "kl": 0.45361328125, + "learning_rate": 9.948633453445297e-07, + "loss": 0.0857, + "reward": 1.078125074505806, + "reward_std": 0.21099549531936646, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9531250298023224, + "step": 480 + }, + { + "completion_length": 1406.6384582519531, + "epoch": 0.1436785900978269, + "grad_norm": 7.098046779632568, + "kl": 0.6630859375, + "learning_rate": 9.94792388063581e-07, + "loss": 0.044, + "reward": 1.0558035969734192, + "reward_std": 0.2777559459209442, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9285714775323868, + "step": 481 + }, + { + "completion_length": 1365.8705749511719, + "epoch": 0.14397729818534838, + "grad_norm": 4.726226806640625, + "kl": 0.6083984375, + "learning_rate": 9.94720946892108e-07, + "loss": 0.0636, + "reward": 1.176339328289032, + "reward_std": 0.2325497902929783, + "rewards/accuracy_reward": 0.2321428656578064, + "rewards/format_reward": 0.9441964775323868, + "step": 482 + }, + { + "completion_length": 1350.0514221191406, + "epoch": 0.14427600627286985, + "grad_norm": 1.4464112520217896, + "kl": 0.65673828125, + "learning_rate": 9.946490219078326e-07, + "loss": 0.0897, + "reward": 1.0312500447034836, + "reward_std": 0.22351640090346336, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.95089291036129, + "step": 483 + }, + { + "completion_length": 1311.5090026855469, + "epoch": 0.14457471436039132, + "grad_norm": 1.2163455486297607, + "kl": 0.6669921875, + "learning_rate": 9.945766131890014e-07, + "loss": 0.0681, + "reward": 1.1941964626312256, + "reward_std": 0.26260071620345116, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9642857611179352, + "step": 484 + }, + { + "completion_length": 1471.55810546875, + "epoch": 0.14487342244791276, + "grad_norm": 6.303879261016846, + "kl": 1.2421875, + "learning_rate": 9.945037208143882e-07, + "loss": 0.0808, + "reward": 1.0334821939468384, + "reward_std": 0.2750401683151722, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.9375000298023224, + "step": 485 + }, + { + "completion_length": 1375.1295471191406, + "epoch": 0.14517213053543424, + "grad_norm": 3.2874319553375244, + "kl": 0.8037109375, + "learning_rate": 9.94430344863293e-07, + "loss": 0.0741, + "reward": 1.0156250149011612, + "reward_std": 0.21585860289633274, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.9531250447034836, + "step": 486 + }, + { + "completion_length": 1279.47998046875, + "epoch": 0.1454708386229557, + "grad_norm": 15.850038528442383, + "kl": 1.4423828125, + "learning_rate": 9.943564854155412e-07, + "loss": 0.1353, + "reward": 1.0245536118745804, + "reward_std": 0.26969054713845253, + "rewards/accuracy_reward": 0.08705357415601611, + "rewards/format_reward": 0.9375000447034836, + "step": 487 + }, + { + "completion_length": 1291.2500610351562, + "epoch": 0.14576954671047718, + "grad_norm": 3.364764928817749, + "kl": 1.11572265625, + "learning_rate": 9.942821425514853e-07, + "loss": 0.1105, + "reward": 0.9799107611179352, + "reward_std": 0.2638242281973362, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.90401791036129, + "step": 488 + }, + { + "completion_length": 1348.1719055175781, + "epoch": 0.14606825479799865, + "grad_norm": 2.6410300731658936, + "kl": 0.6513671875, + "learning_rate": 9.942073163520023e-07, + "loss": 0.0816, + "reward": 1.0625000447034836, + "reward_std": 0.2457689680159092, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.9441964626312256, + "step": 489 + }, + { + "completion_length": 1277.94873046875, + "epoch": 0.14636696288552012, + "grad_norm": 4.275395393371582, + "kl": 0.88671875, + "learning_rate": 9.941320068984961e-07, + "loss": 0.1344, + "reward": 0.975446492433548, + "reward_std": 0.25319115817546844, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.9308036118745804, + "step": 490 + }, + { + "completion_length": 1339.82373046875, + "epoch": 0.1466656709730416, + "grad_norm": 3.7089250087738037, + "kl": 1.001953125, + "learning_rate": 9.940562142728961e-07, + "loss": 0.119, + "reward": 1.0513393431901932, + "reward_std": 0.2268078811466694, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9441964775323868, + "step": 491 + }, + { + "completion_length": 1275.1183166503906, + "epoch": 0.14696437906056306, + "grad_norm": 2.5529887676239014, + "kl": 1.080078125, + "learning_rate": 9.939799385576573e-07, + "loss": 0.1485, + "reward": 1.1294643580913544, + "reward_std": 0.280772116035223, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.9241071790456772, + "step": 492 + }, + { + "completion_length": 1393.24560546875, + "epoch": 0.14726308714808453, + "grad_norm": 2.78425931930542, + "kl": 1.701171875, + "learning_rate": 9.9390317983576e-07, + "loss": 0.1714, + "reward": 1.0089286267757416, + "reward_std": 0.3176424242556095, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.8816964775323868, + "step": 493 + }, + { + "completion_length": 1288.7969360351562, + "epoch": 0.147561795235606, + "grad_norm": 6.430884838104248, + "kl": 1.05029296875, + "learning_rate": 9.9382593819071e-07, + "loss": 0.1501, + "reward": 1.0669643431901932, + "reward_std": 0.30431773513555527, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9062500447034836, + "step": 494 + }, + { + "completion_length": 1434.4152526855469, + "epoch": 0.14786050332312747, + "grad_norm": 3.5807881355285645, + "kl": 1.92578125, + "learning_rate": 9.93748213706539e-07, + "loss": 0.1881, + "reward": 0.979910746216774, + "reward_std": 0.3765876442193985, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.881696492433548, + "step": 495 + }, + { + "completion_length": 1356.1652221679688, + "epoch": 0.14815921141064894, + "grad_norm": 2.1922528743743896, + "kl": 1.7734375, + "learning_rate": 9.936700064678033e-07, + "loss": 0.1975, + "reward": 0.96651791036129, + "reward_std": 0.3255634382367134, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.87276791036129, + "step": 496 + }, + { + "completion_length": 1438.1272888183594, + "epoch": 0.14845791949817042, + "grad_norm": 2.866145133972168, + "kl": 1.939453125, + "learning_rate": 9.93591316559585e-07, + "loss": 0.1957, + "reward": 1.0558036118745804, + "reward_std": 0.32302603125572205, + "rewards/accuracy_reward": 0.20982143469154835, + "rewards/format_reward": 0.8459821790456772, + "step": 497 + }, + { + "completion_length": 1371.4464721679688, + "epoch": 0.1487566275856919, + "grad_norm": 5.827482223510742, + "kl": 0.9208984375, + "learning_rate": 9.935121440674913e-07, + "loss": 0.153, + "reward": 1.0714285969734192, + "reward_std": 0.3835739344358444, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.8816964626312256, + "step": 498 + }, + { + "completion_length": 1322.2411499023438, + "epoch": 0.14905533567321336, + "grad_norm": 9.317350387573242, + "kl": 1.369140625, + "learning_rate": 9.934324890776533e-07, + "loss": 0.1889, + "reward": 0.9843750447034836, + "reward_std": 0.4234779477119446, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.8459821790456772, + "step": 499 + }, + { + "completion_length": 1417.9464721679688, + "epoch": 0.14935404376073483, + "grad_norm": 3.2848992347717285, + "kl": 1.59375, + "learning_rate": 9.933523516767282e-07, + "loss": 0.1699, + "reward": 0.9285714775323868, + "reward_std": 0.3711507022380829, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.870535746216774, + "step": 500 + }, + { + "completion_length": 1423.0960388183594, + "epoch": 0.1496527518482563, + "grad_norm": 1.8570271730422974, + "kl": 1.87890625, + "learning_rate": 9.932717319518979e-07, + "loss": 0.2006, + "reward": 0.979910746216774, + "reward_std": 0.31800568103790283, + "rewards/accuracy_reward": 0.09151785867288709, + "rewards/format_reward": 0.8883928954601288, + "step": 501 + }, + { + "completion_length": 1422.388427734375, + "epoch": 0.14995145993577777, + "grad_norm": 3.190757989883423, + "kl": 2.07421875, + "learning_rate": 9.931906299908685e-07, + "loss": 0.1553, + "reward": 0.9575893133878708, + "reward_std": 0.3538705036044121, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.8883928954601288, + "step": 502 + }, + { + "completion_length": 1436.1384582519531, + "epoch": 0.15025016802329924, + "grad_norm": 1.218825340270996, + "kl": 1.4921875, + "learning_rate": 9.93109045881871e-07, + "loss": 0.1521, + "reward": 1.0223214626312256, + "reward_std": 0.2914761044085026, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9151786118745804, + "step": 503 + }, + { + "completion_length": 1425.6094360351562, + "epoch": 0.1505488761108207, + "grad_norm": 2.202352523803711, + "kl": 0.83203125, + "learning_rate": 9.930269797136608e-07, + "loss": 0.1214, + "reward": 1.0312500447034836, + "reward_std": 0.21385271474719048, + "rewards/accuracy_reward": 0.09598215017467737, + "rewards/format_reward": 0.93526791036129, + "step": 504 + }, + { + "completion_length": 1441.3728332519531, + "epoch": 0.15084758419834218, + "grad_norm": 1.405349850654602, + "kl": 0.9931640625, + "learning_rate": 9.929444315755182e-07, + "loss": 0.119, + "reward": 1.066964328289032, + "reward_std": 0.3184109926223755, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.9151786118745804, + "step": 505 + }, + { + "completion_length": 1492.7411499023438, + "epoch": 0.15114629228586363, + "grad_norm": 1.4282315969467163, + "kl": 0.83984375, + "learning_rate": 9.92861401557247e-07, + "loss": 0.08, + "reward": 0.9732143133878708, + "reward_std": 0.2839397192001343, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.9084821939468384, + "step": 506 + }, + { + "completion_length": 1446.2567443847656, + "epoch": 0.1514450003733851, + "grad_norm": 1.3040974140167236, + "kl": 1.0595703125, + "learning_rate": 9.927778897491763e-07, + "loss": 0.1426, + "reward": 1.0625000298023224, + "reward_std": 0.27956460788846016, + "rewards/accuracy_reward": 0.1450892877765, + "rewards/format_reward": 0.9174107611179352, + "step": 507 + }, + { + "completion_length": 1433.7545166015625, + "epoch": 0.15174370846090657, + "grad_norm": 3.684910774230957, + "kl": 1.349609375, + "learning_rate": 9.926938962421582e-07, + "loss": 0.1032, + "reward": 1.0915179252624512, + "reward_std": 0.25911641120910645, + "rewards/accuracy_reward": 0.19419643748551607, + "rewards/format_reward": 0.8973214775323868, + "step": 508 + }, + { + "completion_length": 1551.7277526855469, + "epoch": 0.15204241654842804, + "grad_norm": 1.586422324180603, + "kl": 0.8916015625, + "learning_rate": 9.9260942112757e-07, + "loss": 0.0698, + "reward": 0.9732143431901932, + "reward_std": 0.26888201758265495, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.9107143431901932, + "step": 509 + }, + { + "completion_length": 1454.7478332519531, + "epoch": 0.1523411246359495, + "grad_norm": 2.7146174907684326, + "kl": 0.884765625, + "learning_rate": 9.925244644973115e-07, + "loss": 0.0534, + "reward": 0.973214328289032, + "reward_std": 0.19540737941861153, + "rewards/accuracy_reward": 0.03794643050059676, + "rewards/format_reward": 0.93526791036129, + "step": 510 + }, + { + "completion_length": 1421.0960693359375, + "epoch": 0.15263983272347098, + "grad_norm": 1.0720986127853394, + "kl": 0.53759765625, + "learning_rate": 9.92439026443808e-07, + "loss": 0.0707, + "reward": 1.1584821939468384, + "reward_std": 0.18927239999175072, + "rewards/accuracy_reward": 0.2187500074505806, + "rewards/format_reward": 0.9397321790456772, + "step": 511 + }, + { + "completion_length": 1556.1183471679688, + "epoch": 0.15293854081099245, + "grad_norm": 0.804685115814209, + "kl": 0.8857421875, + "learning_rate": 9.923531070600073e-07, + "loss": 0.088, + "reward": 1.1116071939468384, + "reward_std": 0.2741067372262478, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9241071939468384, + "step": 512 + }, + { + "completion_length": 1467.6451416015625, + "epoch": 0.15323724889851392, + "grad_norm": 1.4674891233444214, + "kl": 0.8173828125, + "learning_rate": 9.922667064393816e-07, + "loss": 0.0701, + "reward": 1.0022321939468384, + "reward_std": 0.23421884700655937, + "rewards/accuracy_reward": 0.08035714854486287, + "rewards/format_reward": 0.9218750298023224, + "step": 513 + }, + { + "completion_length": 1471.5982971191406, + "epoch": 0.1535359569860354, + "grad_norm": 0.5812219977378845, + "kl": 0.54736328125, + "learning_rate": 9.921798246759258e-07, + "loss": 0.0549, + "reward": 1.0669643431901932, + "reward_std": 0.2842535935342312, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9330357611179352, + "step": 514 + }, + { + "completion_length": 1437.7656860351562, + "epoch": 0.15383466507355686, + "grad_norm": 0.8055741786956787, + "kl": 0.7080078125, + "learning_rate": 9.92092461864159e-07, + "loss": 0.0982, + "reward": 1.0334821939468384, + "reward_std": 0.2740909717977047, + "rewards/accuracy_reward": 0.11607143585570157, + "rewards/format_reward": 0.9174107611179352, + "step": 515 + }, + { + "completion_length": 1388.6652526855469, + "epoch": 0.15413337316107834, + "grad_norm": 0.7661802172660828, + "kl": 0.4150390625, + "learning_rate": 9.920046180991236e-07, + "loss": 0.0386, + "reward": 0.973214328289032, + "reward_std": 0.16029666550457478, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.9531250447034836, + "step": 516 + }, + { + "completion_length": 1317.6049499511719, + "epoch": 0.1544320812485998, + "grad_norm": 0.8680989742279053, + "kl": 0.5712890625, + "learning_rate": 9.919162934763848e-07, + "loss": 0.1049, + "reward": 1.1473215073347092, + "reward_std": 0.2379932850599289, + "rewards/accuracy_reward": 0.20089286682195961, + "rewards/format_reward": 0.9464286118745804, + "step": 517 + }, + { + "completion_length": 1462.6607666015625, + "epoch": 0.15473078933612128, + "grad_norm": 0.8988242149353027, + "kl": 0.49560546875, + "learning_rate": 9.918274880920311e-07, + "loss": 0.0582, + "reward": 1.0781250596046448, + "reward_std": 0.17931898683309555, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.9620535969734192, + "step": 518 + }, + { + "completion_length": 1404.7879943847656, + "epoch": 0.15502949742364275, + "grad_norm": 1.300063133239746, + "kl": 0.62255859375, + "learning_rate": 9.917382020426742e-07, + "loss": 0.0837, + "reward": 1.1049107611179352, + "reward_std": 0.29403722286224365, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9464286118745804, + "step": 519 + }, + { + "completion_length": 1474.7723999023438, + "epoch": 0.15532820551116422, + "grad_norm": 0.8322558999061584, + "kl": 0.7998046875, + "learning_rate": 9.916484354254486e-07, + "loss": 0.0992, + "reward": 1.0401786267757416, + "reward_std": 0.2301786206662655, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9375000298023224, + "step": 520 + }, + { + "completion_length": 1438.3438415527344, + "epoch": 0.1556269135986857, + "grad_norm": 2.609076976776123, + "kl": 0.84375, + "learning_rate": 9.915581883380112e-07, + "loss": 0.0576, + "reward": 1.0000000149011612, + "reward_std": 0.17318508587777615, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.9531250298023224, + "step": 521 + }, + { + "completion_length": 1486.6183471679688, + "epoch": 0.15592562168620716, + "grad_norm": 2.104175329208374, + "kl": 0.71484375, + "learning_rate": 9.914674608785422e-07, + "loss": 0.0454, + "reward": 1.0892857909202576, + "reward_std": 0.1654028818011284, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.9687500298023224, + "step": 522 + }, + { + "completion_length": 1436.1741943359375, + "epoch": 0.15622432977372863, + "grad_norm": 2.147237539291382, + "kl": 0.7783203125, + "learning_rate": 9.913762531457444e-07, + "loss": 0.072, + "reward": 1.100446492433548, + "reward_std": 0.19019842520356178, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9642857611179352, + "step": 523 + }, + { + "completion_length": 1441.5870971679688, + "epoch": 0.1565230378612501, + "grad_norm": 0.9576302170753479, + "kl": 0.7119140625, + "learning_rate": 9.912845652388425e-07, + "loss": 0.0794, + "reward": 1.1093750596046448, + "reward_std": 0.22902600467205048, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9598214775323868, + "step": 524 + }, + { + "completion_length": 1406.6920471191406, + "epoch": 0.15682174594877157, + "grad_norm": 1.399646520614624, + "kl": 0.62109375, + "learning_rate": 9.911923972575844e-07, + "loss": 0.0617, + "reward": 1.0937500596046448, + "reward_std": 0.17796311527490616, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9598214775323868, + "step": 525 + }, + { + "completion_length": 1485.4353332519531, + "epoch": 0.15712045403629304, + "grad_norm": 0.9057657718658447, + "kl": 0.43994140625, + "learning_rate": 9.910997493022395e-07, + "loss": 0.0388, + "reward": 1.069196492433548, + "reward_std": 0.1982332393527031, + "rewards/accuracy_reward": 0.10714286169968545, + "rewards/format_reward": 0.9620535969734192, + "step": 526 + }, + { + "completion_length": 1488.3973999023438, + "epoch": 0.15741916212381452, + "grad_norm": 1.843037724494934, + "kl": 0.321044921875, + "learning_rate": 9.910066214735997e-07, + "loss": 0.0292, + "reward": 1.0357143431901932, + "reward_std": 0.20195871219038963, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.9620535969734192, + "step": 527 + }, + { + "completion_length": 1436.4822082519531, + "epoch": 0.15771787021133596, + "grad_norm": 0.8385283946990967, + "kl": 0.3427734375, + "learning_rate": 9.90913013872979e-07, + "loss": 0.0449, + "reward": 1.176339328289032, + "reward_std": 0.21698268502950668, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.9709821790456772, + "step": 528 + }, + { + "completion_length": 1329.2746276855469, + "epoch": 0.15801657829885743, + "grad_norm": 0.48378559947013855, + "kl": 0.211181640625, + "learning_rate": 9.908189266022135e-07, + "loss": 0.0323, + "reward": 1.1049107611179352, + "reward_std": 0.11884133517742157, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.977678582072258, + "step": 529 + }, + { + "completion_length": 1257.0357666015625, + "epoch": 0.1583152863863789, + "grad_norm": 0.5126137137413025, + "kl": 0.223876953125, + "learning_rate": 9.907243597636606e-07, + "loss": 0.0196, + "reward": 1.0736607611179352, + "reward_std": 0.2091652974486351, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.964285746216774, + "step": 530 + }, + { + "completion_length": 1294.888427734375, + "epoch": 0.15861399447390037, + "grad_norm": 0.5369065999984741, + "kl": 0.18310546875, + "learning_rate": 9.906293134602e-07, + "loss": 0.0365, + "reward": 1.1071428954601288, + "reward_std": 0.15208219084888697, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9754464775323868, + "step": 531 + }, + { + "completion_length": 1395.8772888183594, + "epoch": 0.15891270256142184, + "grad_norm": 1.17400062084198, + "kl": 0.23974609375, + "learning_rate": 9.905337877952326e-07, + "loss": 0.0195, + "reward": 1.06026791036129, + "reward_std": 0.17903741262853146, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 532 + }, + { + "completion_length": 1242.8482666015625, + "epoch": 0.1592114106489433, + "grad_norm": 0.773529589176178, + "kl": 0.224365234375, + "learning_rate": 9.90437782872681e-07, + "loss": 0.0421, + "reward": 1.145089328289032, + "reward_std": 0.11946445144712925, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9843750298023224, + "step": 533 + }, + { + "completion_length": 1318.9732666015625, + "epoch": 0.15951011873646478, + "grad_norm": 0.7625306844711304, + "kl": 0.272705078125, + "learning_rate": 9.903412987969894e-07, + "loss": 0.0232, + "reward": 1.0468750596046448, + "reward_std": 0.19940774515271187, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.9531250447034836, + "step": 534 + }, + { + "completion_length": 1281.5915832519531, + "epoch": 0.15980882682398626, + "grad_norm": 0.6075838804244995, + "kl": 0.31494140625, + "learning_rate": 9.902443356731225e-07, + "loss": 0.0162, + "reward": 1.0312500596046448, + "reward_std": 0.16461819224059582, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.9687500596046448, + "step": 535 + }, + { + "completion_length": 1288.0625610351562, + "epoch": 0.16010753491150773, + "grad_norm": 0.8269878625869751, + "kl": 0.3623046875, + "learning_rate": 9.901468936065673e-07, + "loss": 0.0574, + "reward": 1.1897321939468384, + "reward_std": 0.20074428990483284, + "rewards/accuracy_reward": 0.2276785746216774, + "rewards/format_reward": 0.9620536118745804, + "step": 536 + }, + { + "completion_length": 1203.7544860839844, + "epoch": 0.1604062429990292, + "grad_norm": 0.6656994223594666, + "kl": 0.294189453125, + "learning_rate": 9.900489727033305e-07, + "loss": 0.0347, + "reward": 1.1272321939468384, + "reward_std": 0.1822972111403942, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.973214328289032, + "step": 537 + }, + { + "completion_length": 1235.763427734375, + "epoch": 0.16070495108655067, + "grad_norm": 1.676682472229004, + "kl": 0.47021484375, + "learning_rate": 9.899505730699412e-07, + "loss": 0.0334, + "reward": 1.1026785969734192, + "reward_std": 0.14064805209636688, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9843750298023224, + "step": 538 + }, + { + "completion_length": 1251.8438110351562, + "epoch": 0.16100365917407214, + "grad_norm": 0.47526228427886963, + "kl": 0.34326171875, + "learning_rate": 9.89851694813448e-07, + "loss": 0.0291, + "reward": 1.1272321939468384, + "reward_std": 0.14532578364014626, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9776785969734192, + "step": 539 + }, + { + "completion_length": 1226.3147583007812, + "epoch": 0.1613023672615936, + "grad_norm": 1.0475187301635742, + "kl": 0.46484375, + "learning_rate": 9.89752338041421e-07, + "loss": 0.0433, + "reward": 1.160714328289032, + "reward_std": 0.2203649878501892, + "rewards/accuracy_reward": 0.1830357275903225, + "rewards/format_reward": 0.9776786118745804, + "step": 540 + }, + { + "completion_length": 1120.9442749023438, + "epoch": 0.16160107534911508, + "grad_norm": 1.6763092279434204, + "kl": 0.43408203125, + "learning_rate": 9.896525028619504e-07, + "loss": 0.041, + "reward": 1.0848214626312256, + "reward_std": 0.18245564308017492, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.970982164144516, + "step": 541 + }, + { + "completion_length": 1191.3259582519531, + "epoch": 0.16189978343663655, + "grad_norm": 0.6983539462089539, + "kl": 0.36279296875, + "learning_rate": 9.895521893836474e-07, + "loss": 0.0252, + "reward": 1.0625000596046448, + "reward_std": 0.17666081711649895, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.9642857760190964, + "step": 542 + }, + { + "completion_length": 1299.1295471191406, + "epoch": 0.16219849152415802, + "grad_norm": 0.8807494640350342, + "kl": 0.31591796875, + "learning_rate": 9.89451397715643e-07, + "loss": 0.0239, + "reward": 1.0602678954601288, + "reward_std": 0.12655964866280556, + "rewards/accuracy_reward": 0.07812500349245965, + "rewards/format_reward": 0.9821428954601288, + "step": 543 + }, + { + "completion_length": 1074.200927734375, + "epoch": 0.1624971996116795, + "grad_norm": 0.5811916589736938, + "kl": 0.173828125, + "learning_rate": 9.89350127967589e-07, + "loss": 0.034, + "reward": 1.2566964626312256, + "reward_std": 0.20816698670387268, + "rewards/accuracy_reward": 0.2656250111758709, + "rewards/format_reward": 0.9910714477300644, + "step": 544 + }, + { + "completion_length": 1208.7745666503906, + "epoch": 0.16279590769920096, + "grad_norm": 0.19839231669902802, + "kl": 0.132568359375, + "learning_rate": 9.892483802496565e-07, + "loss": -0.0111, + "reward": 1.0915179252624512, + "reward_std": 0.15926308184862137, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.9866071790456772, + "step": 545 + }, + { + "completion_length": 1177.7120971679688, + "epoch": 0.16309461578672244, + "grad_norm": 0.5080670714378357, + "kl": 0.1131591796875, + "learning_rate": 9.891461546725373e-07, + "loss": -0.0099, + "reward": 1.07589291036129, + "reward_std": 0.14149555750191212, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.9843750447034836, + "step": 546 + }, + { + "completion_length": 1217.6674499511719, + "epoch": 0.1633933238742439, + "grad_norm": 0.5310563445091248, + "kl": 0.16845703125, + "learning_rate": 9.89043451347443e-07, + "loss": 0.0372, + "reward": 1.0959821939468384, + "reward_std": 0.18204668164253235, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9821428954601288, + "step": 547 + }, + { + "completion_length": 1262.8839721679688, + "epoch": 0.16369203196176538, + "grad_norm": 0.2438688427209854, + "kl": 0.1129150390625, + "learning_rate": 9.889402703861042e-07, + "loss": -0.0133, + "reward": 1.0959821939468384, + "reward_std": 0.16922523453831673, + "rewards/accuracy_reward": 0.12276786472648382, + "rewards/format_reward": 0.9732143431901932, + "step": 548 + }, + { + "completion_length": 1018.3549346923828, + "epoch": 0.16399074004928682, + "grad_norm": 0.4260731637477875, + "kl": 0.137451171875, + "learning_rate": 9.88836611900772e-07, + "loss": 0.0154, + "reward": 1.1651786267757416, + "reward_std": 0.2719757407903671, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9687500447034836, + "step": 549 + }, + { + "completion_length": 1224.8103332519531, + "epoch": 0.1642894481368083, + "grad_norm": 0.643839418888092, + "kl": 0.1322021484375, + "learning_rate": 9.887324760042168e-07, + "loss": 0.0115, + "reward": 1.1138393580913544, + "reward_std": 0.14497339446097612, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.98214291036129, + "step": 550 + }, + { + "completion_length": 1232.0022888183594, + "epoch": 0.16458815622432976, + "grad_norm": 1.0960103273391724, + "kl": 0.18798828125, + "learning_rate": 9.886278628097281e-07, + "loss": 0.0294, + "reward": 1.069196492433548, + "reward_std": 0.18338944483548403, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.9441964775323868, + "step": 551 + }, + { + "completion_length": 1253.4822082519531, + "epoch": 0.16488686431185123, + "grad_norm": 0.36903926730155945, + "kl": 0.18896484375, + "learning_rate": 9.885227724311147e-07, + "loss": 0.0171, + "reward": 1.1116071939468384, + "reward_std": 0.1810413273051381, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.9687500298023224, + "step": 552 + }, + { + "completion_length": 1105.8058471679688, + "epoch": 0.1651855723993727, + "grad_norm": 0.5122747421264648, + "kl": 0.162109375, + "learning_rate": 9.884172049827048e-07, + "loss": 0.0173, + "reward": 1.100446492433548, + "reward_std": 0.17265655100345612, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9776786118745804, + "step": 553 + }, + { + "completion_length": 1145.0714874267578, + "epoch": 0.16548428048689418, + "grad_norm": 1.2025448083877563, + "kl": 0.20263671875, + "learning_rate": 9.883111605793453e-07, + "loss": 0.0424, + "reward": 1.1406250596046448, + "reward_std": 0.25048597529530525, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9642857611179352, + "step": 554 + }, + { + "completion_length": 1182.2991943359375, + "epoch": 0.16578298857441565, + "grad_norm": 0.8587756752967834, + "kl": 0.18359375, + "learning_rate": 9.882046393364024e-07, + "loss": 0.0484, + "reward": 1.1718750298023224, + "reward_std": 0.1606842763721943, + "rewards/accuracy_reward": 0.1919642984867096, + "rewards/format_reward": 0.979910746216774, + "step": 555 + }, + { + "completion_length": 1214.6451110839844, + "epoch": 0.16608169666193712, + "grad_norm": 1.113610863685608, + "kl": 0.256591796875, + "learning_rate": 9.880976413697603e-07, + "loss": 0.037, + "reward": 1.0959822237491608, + "reward_std": 0.1694665253162384, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.973214328289032, + "step": 556 + }, + { + "completion_length": 1199.7545471191406, + "epoch": 0.1663804047494586, + "grad_norm": 0.5755648016929626, + "kl": 0.368408203125, + "learning_rate": 9.879901667958228e-07, + "loss": 0.0164, + "reward": 1.0803571939468384, + "reward_std": 0.22975832968950272, + "rewards/accuracy_reward": 0.12500000791624188, + "rewards/format_reward": 0.9553571790456772, + "step": 557 + }, + { + "completion_length": 1227.57373046875, + "epoch": 0.16667911283698006, + "grad_norm": 1.2063660621643066, + "kl": 0.43994140625, + "learning_rate": 9.878822157315115e-07, + "loss": 0.026, + "reward": 1.0446428954601288, + "reward_std": 0.17777143139392138, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.973214328289032, + "step": 558 + }, + { + "completion_length": 1234.966552734375, + "epoch": 0.16697782092450153, + "grad_norm": 2.9387965202331543, + "kl": 0.6513671875, + "learning_rate": 9.877737882942665e-07, + "loss": 0.0532, + "reward": 1.0446428954601288, + "reward_std": 0.18149437382817268, + "rewards/accuracy_reward": 0.07589286146685481, + "rewards/format_reward": 0.9687500447034836, + "step": 559 + }, + { + "completion_length": 1263.2991638183594, + "epoch": 0.167276529012023, + "grad_norm": 1.873143196105957, + "kl": 0.60107421875, + "learning_rate": 9.876648846020464e-07, + "loss": 0.0428, + "reward": 1.1383928954601288, + "reward_std": 0.21489175409078598, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9709821790456772, + "step": 560 + }, + { + "completion_length": 1146.1986999511719, + "epoch": 0.16757523709954447, + "grad_norm": 1.8659497499465942, + "kl": 0.4150390625, + "learning_rate": 9.875555047733273e-07, + "loss": 0.0323, + "reward": 1.1718750596046448, + "reward_std": 0.21319712325930595, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9821428805589676, + "step": 561 + }, + { + "completion_length": 1243.5022888183594, + "epoch": 0.16787394518706594, + "grad_norm": 1.4008493423461914, + "kl": 0.50732421875, + "learning_rate": 9.874456489271043e-07, + "loss": 0.0571, + "reward": 1.1272322237491608, + "reward_std": 0.23552844300866127, + "rewards/accuracy_reward": 0.14955357275903225, + "rewards/format_reward": 0.9776786267757416, + "step": 562 + }, + { + "completion_length": 1220.1250305175781, + "epoch": 0.1681726532745874, + "grad_norm": 0.2829214632511139, + "kl": 0.33203125, + "learning_rate": 9.873353171828894e-07, + "loss": 0.0478, + "reward": 1.160714328289032, + "reward_std": 0.16901548579335213, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.98214291036129, + "step": 563 + }, + { + "completion_length": 1194.8259582519531, + "epoch": 0.16847136136210888, + "grad_norm": 0.5096168518066406, + "kl": 0.323486328125, + "learning_rate": 9.87224509660713e-07, + "loss": 0.0489, + "reward": 1.042410746216774, + "reward_std": 0.15010598115622997, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.9620536118745804, + "step": 564 + }, + { + "completion_length": 1237.6786193847656, + "epoch": 0.16877006944963036, + "grad_norm": 0.8683958649635315, + "kl": 0.2509765625, + "learning_rate": 9.871132264811227e-07, + "loss": 0.0391, + "reward": 1.1428572237491608, + "reward_std": 0.1717284433543682, + "rewards/accuracy_reward": 0.16294643469154835, + "rewards/format_reward": 0.979910746216774, + "step": 565 + }, + { + "completion_length": 1109.0692443847656, + "epoch": 0.16906877753715183, + "grad_norm": 0.2414964884519577, + "kl": 0.229736328125, + "learning_rate": 9.870014677651837e-07, + "loss": 0.0405, + "reward": 1.0714286267757416, + "reward_std": 0.18628669530153275, + "rewards/accuracy_reward": 0.10044643376022577, + "rewards/format_reward": 0.9709821790456772, + "step": 566 + }, + { + "completion_length": 1300.2947387695312, + "epoch": 0.1693674856246733, + "grad_norm": 0.6275210380554199, + "kl": 0.1796875, + "learning_rate": 9.868892336344783e-07, + "loss": 0.0337, + "reward": 1.1227678954601288, + "reward_std": 0.23302962258458138, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9754464775323868, + "step": 567 + }, + { + "completion_length": 1156.825942993164, + "epoch": 0.16966619371219477, + "grad_norm": 0.559144139289856, + "kl": 0.142578125, + "learning_rate": 9.867765242111069e-07, + "loss": 0.0326, + "reward": 1.1250000298023224, + "reward_std": 0.17406975850462914, + "rewards/accuracy_reward": 0.14732143771834671, + "rewards/format_reward": 0.9776786118745804, + "step": 568 + }, + { + "completion_length": 1304.5402526855469, + "epoch": 0.16996490179971624, + "grad_norm": 0.7942366003990173, + "kl": 0.1512451171875, + "learning_rate": 9.866633396176853e-07, + "loss": 0.0447, + "reward": 1.1116071939468384, + "reward_std": 0.2144116684794426, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9665178954601288, + "step": 569 + }, + { + "completion_length": 1187.3326416015625, + "epoch": 0.1702636098872377, + "grad_norm": 0.5213910341262817, + "kl": 0.1380615234375, + "learning_rate": 9.865496799773482e-07, + "loss": 0.0386, + "reward": 1.2098214626312256, + "reward_std": 0.23835323378443718, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.973214328289032, + "step": 570 + }, + { + "completion_length": 1199.4621124267578, + "epoch": 0.17056231797475915, + "grad_norm": 0.2694116234779358, + "kl": 0.165283203125, + "learning_rate": 9.864355454137456e-07, + "loss": 0.0177, + "reward": 1.0781250149011612, + "reward_std": 0.2037312053143978, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.9754464626312256, + "step": 571 + }, + { + "completion_length": 1265.4263916015625, + "epoch": 0.17086102606228062, + "grad_norm": 0.1886771321296692, + "kl": 0.1541748046875, + "learning_rate": 9.863209360510449e-07, + "loss": 0.0262, + "reward": 1.1473214626312256, + "reward_std": 0.17004666104912758, + "rewards/accuracy_reward": 0.17857143841683865, + "rewards/format_reward": 0.9687500447034836, + "step": 572 + }, + { + "completion_length": 1239.5469055175781, + "epoch": 0.1711597341498021, + "grad_norm": 0.44173717498779297, + "kl": 0.181396484375, + "learning_rate": 9.8620585201393e-07, + "loss": 0.0312, + "reward": 1.0424107909202576, + "reward_std": 0.18708964064717293, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.95089291036129, + "step": 573 + }, + { + "completion_length": 1215.8148193359375, + "epoch": 0.17145844223732357, + "grad_norm": 0.4099595546722412, + "kl": 0.1805419921875, + "learning_rate": 9.860902934276005e-07, + "loss": 0.0221, + "reward": 1.051339328289032, + "reward_std": 0.1841468494385481, + "rewards/accuracy_reward": 0.08705357392318547, + "rewards/format_reward": 0.9642857760190964, + "step": 574 + }, + { + "completion_length": 1339.8795166015625, + "epoch": 0.17175715032484504, + "grad_norm": 0.7082549929618835, + "kl": 0.258056640625, + "learning_rate": 9.859742604177734e-07, + "loss": 0.0465, + "reward": 1.0625000298023224, + "reward_std": 0.21493150293827057, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.964285746216774, + "step": 575 + }, + { + "completion_length": 1247.5781707763672, + "epoch": 0.1720558584123665, + "grad_norm": 0.27107033133506775, + "kl": 0.18017578125, + "learning_rate": 9.85857753110681e-07, + "loss": 0.0029, + "reward": 1.0781250298023224, + "reward_std": 0.14763511158525944, + "rewards/accuracy_reward": 0.10491071618162096, + "rewards/format_reward": 0.973214328289032, + "step": 576 + }, + { + "completion_length": 1217.8795166015625, + "epoch": 0.17235456649988798, + "grad_norm": 0.43906936049461365, + "kl": 0.196533203125, + "learning_rate": 9.85740771633072e-07, + "loss": 0.023, + "reward": 1.0736607611179352, + "reward_std": 0.16936879977583885, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.9754464775323868, + "step": 577 + }, + { + "completion_length": 1324.5424499511719, + "epoch": 0.17265327458740945, + "grad_norm": 0.2104405164718628, + "kl": 0.241455078125, + "learning_rate": 9.856233161122105e-07, + "loss": 0.0219, + "reward": 1.102678656578064, + "reward_std": 0.216106865555048, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.96651791036129, + "step": 578 + }, + { + "completion_length": 1237.5313110351562, + "epoch": 0.17295198267493092, + "grad_norm": 0.2844527065753937, + "kl": 0.249755859375, + "learning_rate": 9.855053866758766e-07, + "loss": 0.0516, + "reward": 1.0602678954601288, + "reward_std": 0.16797207109630108, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.973214328289032, + "step": 579 + }, + { + "completion_length": 1258.3326416015625, + "epoch": 0.1732506907624524, + "grad_norm": 0.4761876165866852, + "kl": 0.24560546875, + "learning_rate": 9.853869834523664e-07, + "loss": 0.0461, + "reward": 1.1584822237491608, + "reward_std": 0.19049963541328907, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9754464626312256, + "step": 580 + }, + { + "completion_length": 1326.4777221679688, + "epoch": 0.17354939884997386, + "grad_norm": 0.2800080478191376, + "kl": 0.23388671875, + "learning_rate": 9.852681065704907e-07, + "loss": 0.0196, + "reward": 1.0468750447034836, + "reward_std": 0.2111503854393959, + "rewards/accuracy_reward": 0.08482143143191934, + "rewards/format_reward": 0.9620536118745804, + "step": 581 + }, + { + "completion_length": 1357.3170166015625, + "epoch": 0.17384810693749533, + "grad_norm": 0.2610933184623718, + "kl": 0.221435546875, + "learning_rate": 9.851487561595757e-07, + "loss": 0.0266, + "reward": 1.0892857611179352, + "reward_std": 0.11312206368893385, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9821428954601288, + "step": 582 + }, + { + "completion_length": 1379.4353332519531, + "epoch": 0.1741468150250168, + "grad_norm": 0.6529478430747986, + "kl": 0.224365234375, + "learning_rate": 9.850289323494634e-07, + "loss": 0.0198, + "reward": 1.1026786267757416, + "reward_std": 0.16376546025276184, + "rewards/accuracy_reward": 0.12946428963914514, + "rewards/format_reward": 0.9732143133878708, + "step": 583 + }, + { + "completion_length": 1330.57373046875, + "epoch": 0.17444552311253828, + "grad_norm": 0.3008766174316406, + "kl": 0.23388671875, + "learning_rate": 9.8490863527051e-07, + "loss": 0.0027, + "reward": 1.0959822237491608, + "reward_std": 0.13247638568282127, + "rewards/accuracy_reward": 0.12276786053553224, + "rewards/format_reward": 0.973214328289032, + "step": 584 + }, + { + "completion_length": 1295.4129943847656, + "epoch": 0.17474423120005975, + "grad_norm": 0.9536386132240295, + "kl": 0.259521484375, + "learning_rate": 9.847878650535871e-07, + "loss": 0.0388, + "reward": 1.0245536267757416, + "reward_std": 0.18714196979999542, + "rewards/accuracy_reward": 0.06473214458674192, + "rewards/format_reward": 0.9598214626312256, + "step": 585 + }, + { + "completion_length": 1300.3973693847656, + "epoch": 0.17504293928758122, + "grad_norm": 1.2829164266586304, + "kl": 0.3681640625, + "learning_rate": 9.846666218300807e-07, + "loss": 0.0282, + "reward": 1.055803656578064, + "reward_std": 0.2144799940288067, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.9687500447034836, + "step": 586 + }, + { + "completion_length": 1217.2813110351562, + "epoch": 0.1753416473751027, + "grad_norm": 0.35842373967170715, + "kl": 0.30078125, + "learning_rate": 9.845449057318917e-07, + "loss": 0.0549, + "reward": 1.084821492433548, + "reward_std": 0.18566505797207355, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.9776785969734192, + "step": 587 + }, + { + "completion_length": 1200.5357666015625, + "epoch": 0.17564035546262416, + "grad_norm": 0.7113931179046631, + "kl": 0.280517578125, + "learning_rate": 9.844227168914351e-07, + "loss": 0.0232, + "reward": 1.1495536267757416, + "reward_std": 0.2187516000121832, + "rewards/accuracy_reward": 0.17410714784637094, + "rewards/format_reward": 0.9754464775323868, + "step": 588 + }, + { + "completion_length": 1228.8482971191406, + "epoch": 0.17593906355014563, + "grad_norm": 0.4294999837875366, + "kl": 0.284912109375, + "learning_rate": 9.843000554416408e-07, + "loss": 0.0343, + "reward": 1.1093750298023224, + "reward_std": 0.1786889098584652, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9776786118745804, + "step": 589 + }, + { + "completion_length": 1301.7254638671875, + "epoch": 0.1762377716376671, + "grad_norm": 0.44451582431793213, + "kl": 0.39306640625, + "learning_rate": 9.841769215159522e-07, + "loss": 0.0373, + "reward": 1.0245535969734192, + "reward_std": 0.18156965263187885, + "rewards/accuracy_reward": 0.06250000488944352, + "rewards/format_reward": 0.9620536118745804, + "step": 590 + }, + { + "completion_length": 1224.0648193359375, + "epoch": 0.17653647972518857, + "grad_norm": 0.5083937644958496, + "kl": 0.31298828125, + "learning_rate": 9.840533152483267e-07, + "loss": -0.0092, + "reward": 1.026785746216774, + "reward_std": 0.17295736446976662, + "rewards/accuracy_reward": 0.06473214761354029, + "rewards/format_reward": 0.9620536118745804, + "step": 591 + }, + { + "completion_length": 1261.0647583007812, + "epoch": 0.17683518781271004, + "grad_norm": 0.5411637425422668, + "kl": 0.2900390625, + "learning_rate": 9.83929236773236e-07, + "loss": 0.0606, + "reward": 1.035714328289032, + "reward_std": 0.1897821668535471, + "rewards/accuracy_reward": 0.06026786123402417, + "rewards/format_reward": 0.9754464626312256, + "step": 592 + }, + { + "completion_length": 1291.6451721191406, + "epoch": 0.17713389590023149, + "grad_norm": 0.5702242851257324, + "kl": 0.290283203125, + "learning_rate": 9.838046862256655e-07, + "loss": 0.0085, + "reward": 1.0602679252624512, + "reward_std": 0.19261028617620468, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9620535969734192, + "step": 593 + }, + { + "completion_length": 1319.55810546875, + "epoch": 0.17743260398775296, + "grad_norm": 0.39015597105026245, + "kl": 0.260009765625, + "learning_rate": 9.836796637411136e-07, + "loss": 0.036, + "reward": 1.2031250596046448, + "reward_std": 0.23350665345788002, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.9620536118745804, + "step": 594 + }, + { + "completion_length": 1260.6116638183594, + "epoch": 0.17773131207527443, + "grad_norm": 0.5355831384658813, + "kl": 0.241943359375, + "learning_rate": 9.835541694555928e-07, + "loss": 0.0023, + "reward": 1.0379464775323868, + "reward_std": 0.12596400640904903, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.973214328289032, + "step": 595 + }, + { + "completion_length": 1199.5826416015625, + "epoch": 0.1780300201627959, + "grad_norm": 0.7566810250282288, + "kl": 0.253662109375, + "learning_rate": 9.834282035056286e-07, + "loss": 0.0446, + "reward": 1.0803572237491608, + "reward_std": 0.25917258113622665, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9665178954601288, + "step": 596 + }, + { + "completion_length": 1242.88623046875, + "epoch": 0.17832872825031737, + "grad_norm": 0.4737977683544159, + "kl": 0.3125, + "learning_rate": 9.833017660282596e-07, + "loss": 0.0414, + "reward": 1.0446428954601288, + "reward_std": 0.16775646805763245, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9464286118745804, + "step": 597 + }, + { + "completion_length": 1192.9554138183594, + "epoch": 0.17862743633783884, + "grad_norm": 0.2908605635166168, + "kl": 0.19873046875, + "learning_rate": 9.83174857161037e-07, + "loss": 0.0265, + "reward": 1.1495536267757416, + "reward_std": 0.21395674720406532, + "rewards/accuracy_reward": 0.17187500931322575, + "rewards/format_reward": 0.9776785969734192, + "step": 598 + }, + { + "completion_length": 1143.3705596923828, + "epoch": 0.1789261444253603, + "grad_norm": 0.2605307698249817, + "kl": 0.268310546875, + "learning_rate": 9.830474770420257e-07, + "loss": 0.0484, + "reward": 1.129464328289032, + "reward_std": 0.17208828404545784, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9843750298023224, + "step": 599 + }, + { + "completion_length": 1237.3192749023438, + "epoch": 0.17922485251288178, + "grad_norm": 0.3696470558643341, + "kl": 0.272216796875, + "learning_rate": 9.829196258098025e-07, + "loss": 0.0223, + "reward": 1.0870536267757416, + "reward_std": 0.16757191717624664, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.9709821939468384, + "step": 600 + }, + { + "completion_length": 1367.7322082519531, + "epoch": 0.17952356060040325, + "grad_norm": 0.42966747283935547, + "kl": 0.248291015625, + "learning_rate": 9.82791303603457e-07, + "loss": 0.0228, + "reward": 1.0625000596046448, + "reward_std": 0.20346013642847538, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.988839328289032, + "step": 601 + }, + { + "completion_length": 1226.9219055175781, + "epoch": 0.17982226868792472, + "grad_norm": 0.6020891666412354, + "kl": 0.301513671875, + "learning_rate": 9.826625105625915e-07, + "loss": 0.019, + "reward": 1.1227679252624512, + "reward_std": 0.15017697401344776, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9776786118745804, + "step": 602 + }, + { + "completion_length": 1350.9978637695312, + "epoch": 0.1801209767754462, + "grad_norm": 0.5283297896385193, + "kl": 0.24658203125, + "learning_rate": 9.8253324682732e-07, + "loss": 0.0238, + "reward": 1.0736607611179352, + "reward_std": 0.22183533012866974, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 603 + }, + { + "completion_length": 1222.2389221191406, + "epoch": 0.18041968486296767, + "grad_norm": 0.5753428339958191, + "kl": 0.30078125, + "learning_rate": 9.824035125382686e-07, + "loss": 0.0392, + "reward": 1.064732164144516, + "reward_std": 0.19453714415431023, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.9687500447034836, + "step": 604 + }, + { + "completion_length": 1215.1808776855469, + "epoch": 0.18071839295048914, + "grad_norm": 0.575876772403717, + "kl": 0.21533203125, + "learning_rate": 9.822733078365758e-07, + "loss": 0.021, + "reward": 1.1517857611179352, + "reward_std": 0.18941300548613071, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9754464477300644, + "step": 605 + }, + { + "completion_length": 1299.2478332519531, + "epoch": 0.1810171010380106, + "grad_norm": 0.578353226184845, + "kl": 0.28662109375, + "learning_rate": 9.821426328638914e-07, + "loss": 0.0267, + "reward": 1.0000000298023224, + "reward_std": 0.16942385211586952, + "rewards/accuracy_reward": 0.03571428754366934, + "rewards/format_reward": 0.9642857611179352, + "step": 606 + }, + { + "completion_length": 1294.8750610351562, + "epoch": 0.18131580912553208, + "grad_norm": 0.323152631521225, + "kl": 0.20166015625, + "learning_rate": 9.820114877623768e-07, + "loss": 0.0019, + "reward": 1.162946492433548, + "reward_std": 0.20631814002990723, + "rewards/accuracy_reward": 0.19866072502918541, + "rewards/format_reward": 0.964285746216774, + "step": 607 + }, + { + "completion_length": 1185.96435546875, + "epoch": 0.18161451721305355, + "grad_norm": 0.322354793548584, + "kl": 0.196044921875, + "learning_rate": 9.818798726747055e-07, + "loss": 0.0225, + "reward": 1.0580357760190964, + "reward_std": 0.18654943443834782, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.9754464626312256, + "step": 608 + }, + { + "completion_length": 1295.2366638183594, + "epoch": 0.18191322530057502, + "grad_norm": 0.6346840858459473, + "kl": 0.1943359375, + "learning_rate": 9.817477877440614e-07, + "loss": 0.0232, + "reward": 1.0491072088479996, + "reward_std": 0.18712307140231133, + "rewards/accuracy_reward": 0.07142857555299997, + "rewards/format_reward": 0.9776786118745804, + "step": 609 + }, + { + "completion_length": 1227.7902221679688, + "epoch": 0.1822119333880965, + "grad_norm": 0.4455447196960449, + "kl": 0.153076171875, + "learning_rate": 9.8161523311414e-07, + "loss": 0.0715, + "reward": 1.1183035969734192, + "reward_std": 0.2332894764840603, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.979910746216774, + "step": 610 + }, + { + "completion_length": 1216.9777526855469, + "epoch": 0.18251064147561796, + "grad_norm": 0.46418726444244385, + "kl": 0.157470703125, + "learning_rate": 9.814822089291476e-07, + "loss": 0.0233, + "reward": 1.2254464626312256, + "reward_std": 0.14453382790088654, + "rewards/accuracy_reward": 0.24107143469154835, + "rewards/format_reward": 0.9843750447034836, + "step": 611 + }, + { + "completion_length": 1267.7054138183594, + "epoch": 0.18280934956313943, + "grad_norm": 0.6730651259422302, + "kl": 0.2166748046875, + "learning_rate": 9.81348715333802e-07, + "loss": 0.0117, + "reward": 0.9888393580913544, + "reward_std": 0.16433558985590935, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.9620536118745804, + "step": 612 + }, + { + "completion_length": 1246.3973388671875, + "epoch": 0.1831080576506609, + "grad_norm": 0.35950636863708496, + "kl": 0.1500244140625, + "learning_rate": 9.812147524733309e-07, + "loss": 0.0215, + "reward": 1.1383928805589676, + "reward_std": 0.19760217890143394, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9754464626312256, + "step": 613 + }, + { + "completion_length": 1281.513427734375, + "epoch": 0.18340676573818235, + "grad_norm": 0.3039563298225403, + "kl": 0.17578125, + "learning_rate": 9.810803204934725e-07, + "loss": -0.0072, + "reward": 1.082589328289032, + "reward_std": 0.1861192174255848, + "rewards/accuracy_reward": 0.11160715110599995, + "rewards/format_reward": 0.9709821939468384, + "step": 614 + }, + { + "completion_length": 1278.6964721679688, + "epoch": 0.18370547382570382, + "grad_norm": 0.5138835906982422, + "kl": 0.1483154296875, + "learning_rate": 9.809454195404757e-07, + "loss": 0.0372, + "reward": 1.080357164144516, + "reward_std": 0.21379507333040237, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9732143133878708, + "step": 615 + }, + { + "completion_length": 1325.9152526855469, + "epoch": 0.1840041819132253, + "grad_norm": 0.26237112283706665, + "kl": 0.231689453125, + "learning_rate": 9.808100497610999e-07, + "loss": 0.0082, + "reward": 1.0781250596046448, + "reward_std": 0.16819228138774633, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.9776786118745804, + "step": 616 + }, + { + "completion_length": 1247.74560546875, + "epoch": 0.18430289000074676, + "grad_norm": 0.28937408328056335, + "kl": 0.236572265625, + "learning_rate": 9.806742113026137e-07, + "loss": 0.0369, + "reward": 1.1160714626312256, + "reward_std": 0.19725866243243217, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.964285746216774, + "step": 617 + }, + { + "completion_length": 1297.8839721679688, + "epoch": 0.18460159808826823, + "grad_norm": 0.6646440625190735, + "kl": 0.200927734375, + "learning_rate": 9.805379043127962e-07, + "loss": 0.0283, + "reward": 1.0825893431901932, + "reward_std": 0.1732947276905179, + "rewards/accuracy_reward": 0.11607143236324191, + "rewards/format_reward": 0.96651791036129, + "step": 618 + }, + { + "completion_length": 1253.3705749511719, + "epoch": 0.1849003061757897, + "grad_norm": 0.9479642510414124, + "kl": 0.2646484375, + "learning_rate": 9.804011289399362e-07, + "loss": 0.022, + "reward": 1.0513392984867096, + "reward_std": 0.19075172021985054, + "rewards/accuracy_reward": 0.08258929010480642, + "rewards/format_reward": 0.9687500447034836, + "step": 619 + }, + { + "completion_length": 1244.2991790771484, + "epoch": 0.18519901426331117, + "grad_norm": 0.7940618991851807, + "kl": 0.25146484375, + "learning_rate": 9.802638853328316e-07, + "loss": 0.037, + "reward": 1.0915178954601288, + "reward_std": 0.20580656453967094, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.957589328289032, + "step": 620 + }, + { + "completion_length": 1235.2031860351562, + "epoch": 0.18549772235083264, + "grad_norm": 0.40722137689590454, + "kl": 0.288330078125, + "learning_rate": 9.801261736407903e-07, + "loss": 0.0592, + "reward": 1.0758928954601288, + "reward_std": 0.2540305219590664, + "rewards/accuracy_reward": 0.11607143748551607, + "rewards/format_reward": 0.9598214775323868, + "step": 621 + }, + { + "completion_length": 1248.4642944335938, + "epoch": 0.18579643043835412, + "grad_norm": 0.6488510966300964, + "kl": 0.263916015625, + "learning_rate": 9.79987994013629e-07, + "loss": 0.0335, + "reward": 1.2098214626312256, + "reward_std": 0.250290147960186, + "rewards/accuracy_reward": 0.2321428656578064, + "rewards/format_reward": 0.9776786118745804, + "step": 622 + }, + { + "completion_length": 1333.1205749511719, + "epoch": 0.1860951385258756, + "grad_norm": 0.6355383992195129, + "kl": 0.3291015625, + "learning_rate": 9.798493466016733e-07, + "loss": 0.0786, + "reward": 1.147321492433548, + "reward_std": 0.26611270010471344, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.964285746216774, + "step": 623 + }, + { + "completion_length": 1237.9308471679688, + "epoch": 0.18639384661339706, + "grad_norm": 0.4132673144340515, + "kl": 0.34521484375, + "learning_rate": 9.797102315557585e-07, + "loss": 0.0618, + "reward": 1.1450893431901932, + "reward_std": 0.2643628530204296, + "rewards/accuracy_reward": 0.18750000838190317, + "rewards/format_reward": 0.957589328289032, + "step": 624 + }, + { + "completion_length": 1335.5424499511719, + "epoch": 0.18669255470091853, + "grad_norm": 0.38135743141174316, + "kl": 0.37353515625, + "learning_rate": 9.79570649027228e-07, + "loss": 0.0327, + "reward": 1.0401786267757416, + "reward_std": 0.1773921176791191, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.9687500447034836, + "step": 625 + }, + { + "completion_length": 1270.8103332519531, + "epoch": 0.18699126278844, + "grad_norm": 0.45521965622901917, + "kl": 0.317626953125, + "learning_rate": 9.794305991679336e-07, + "loss": 0.0015, + "reward": 1.1049107611179352, + "reward_std": 0.17849841341376305, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9799107760190964, + "step": 626 + }, + { + "completion_length": 1166.0937805175781, + "epoch": 0.18728997087596147, + "grad_norm": 0.41722315549850464, + "kl": 0.3779296875, + "learning_rate": 9.79290082130236e-07, + "loss": 0.0608, + "reward": 1.1138393580913544, + "reward_std": 0.19377162121236324, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.9687500298023224, + "step": 627 + }, + { + "completion_length": 1309.2188110351562, + "epoch": 0.18758867896348294, + "grad_norm": 0.8479337692260742, + "kl": 0.3291015625, + "learning_rate": 9.79149098067004e-07, + "loss": 0.0347, + "reward": 1.0580357611179352, + "reward_std": 0.1869816854596138, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.979910746216774, + "step": 628 + }, + { + "completion_length": 1272.5402221679688, + "epoch": 0.1878873870510044, + "grad_norm": 0.6709154844284058, + "kl": 0.39794921875, + "learning_rate": 9.790076471316147e-07, + "loss": 0.025, + "reward": 1.145089328289032, + "reward_std": 0.22262870520353317, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.957589328289032, + "step": 629 + }, + { + "completion_length": 1273.4844360351562, + "epoch": 0.18818609513852588, + "grad_norm": 0.7587952017784119, + "kl": 0.333251953125, + "learning_rate": 9.788657294779523e-07, + "loss": 0.0192, + "reward": 1.0558036267757416, + "reward_std": 0.24978552758693695, + "rewards/accuracy_reward": 0.10044643189758062, + "rewards/format_reward": 0.9553571939468384, + "step": 630 + }, + { + "completion_length": 1362.9710388183594, + "epoch": 0.18848480322604735, + "grad_norm": 0.40250054001808167, + "kl": 0.244873046875, + "learning_rate": 9.787233452604096e-07, + "loss": 0.0462, + "reward": 1.0691964775323868, + "reward_std": 0.20680873841047287, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9709821790456772, + "step": 631 + }, + { + "completion_length": 1317.1004943847656, + "epoch": 0.18878351131356882, + "grad_norm": 0.44976967573165894, + "kl": 0.16796875, + "learning_rate": 9.785804946338869e-07, + "loss": 0.0232, + "reward": 1.1540179252624512, + "reward_std": 0.2474312260746956, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.9687500298023224, + "step": 632 + }, + { + "completion_length": 1308.3795166015625, + "epoch": 0.1890822194010903, + "grad_norm": 0.8760434985160828, + "kl": 0.211669921875, + "learning_rate": 9.78437177753791e-07, + "loss": 0.0202, + "reward": 1.0781250596046448, + "reward_std": 0.2557285539805889, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.957589328289032, + "step": 633 + }, + { + "completion_length": 1305.5380249023438, + "epoch": 0.18938092748861177, + "grad_norm": 0.38861945271492004, + "kl": 0.183349609375, + "learning_rate": 9.782933947760374e-07, + "loss": 0.0098, + "reward": 1.0870536267757416, + "reward_std": 0.15617814287543297, + "rewards/accuracy_reward": 0.10714286123402417, + "rewards/format_reward": 0.979910746216774, + "step": 634 + }, + { + "completion_length": 1298.4397583007812, + "epoch": 0.18967963557613324, + "grad_norm": 0.26431599259376526, + "kl": 0.1787109375, + "learning_rate": 9.781491458570475e-07, + "loss": 0.0165, + "reward": 1.0111607611179352, + "reward_std": 0.13085000962018967, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.9799107313156128, + "step": 635 + }, + { + "completion_length": 1352.0380554199219, + "epoch": 0.18997834366365468, + "grad_norm": 0.4009881615638733, + "kl": 0.1883544921875, + "learning_rate": 9.780044311537501e-07, + "loss": 0.0441, + "reward": 1.1183036267757416, + "reward_std": 0.1642962135374546, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9933035969734192, + "step": 636 + }, + { + "completion_length": 1462.7634582519531, + "epoch": 0.19027705175117615, + "grad_norm": 1.469849944114685, + "kl": 0.282958984375, + "learning_rate": 9.778592508235805e-07, + "loss": 0.0462, + "reward": 1.040178656578064, + "reward_std": 0.19976119697093964, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.973214328289032, + "step": 637 + }, + { + "completion_length": 1388.6764221191406, + "epoch": 0.19057575983869762, + "grad_norm": 0.4059983491897583, + "kl": 0.230712890625, + "learning_rate": 9.777136050244806e-07, + "loss": 0.0145, + "reward": 1.1250000298023224, + "reward_std": 0.20252640172839165, + "rewards/accuracy_reward": 0.15848215017467737, + "rewards/format_reward": 0.96651791036129, + "step": 638 + }, + { + "completion_length": 1418.43310546875, + "epoch": 0.1908744679262191, + "grad_norm": 0.45313286781311035, + "kl": 0.1796875, + "learning_rate": 9.775674939148988e-07, + "loss": 0.0213, + "reward": 1.1361607611179352, + "reward_std": 0.15026312787085772, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.979910746216774, + "step": 639 + }, + { + "completion_length": 1278.6183471679688, + "epoch": 0.19117317601374056, + "grad_norm": 1.0950175523757935, + "kl": 0.197998046875, + "learning_rate": 9.774209176537901e-07, + "loss": 0.0542, + "reward": 1.1562500596046448, + "reward_std": 0.2591100446879864, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9598214626312256, + "step": 640 + }, + { + "completion_length": 1484.8103637695312, + "epoch": 0.19147188410126204, + "grad_norm": 0.2669847309589386, + "kl": 0.192138671875, + "learning_rate": 9.772738764006144e-07, + "loss": 0.0111, + "reward": 1.082589328289032, + "reward_std": 0.14367903117090464, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.988839328289032, + "step": 641 + }, + { + "completion_length": 1398.6339721679688, + "epoch": 0.1917705921887835, + "grad_norm": 0.4528186321258545, + "kl": 0.285888671875, + "learning_rate": 9.771263703153388e-07, + "loss": 0.0305, + "reward": 1.073660746216774, + "reward_std": 0.23231955617666245, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.9531250298023224, + "step": 642 + }, + { + "completion_length": 1338.2656555175781, + "epoch": 0.19206930027630498, + "grad_norm": 0.9498985409736633, + "kl": 0.267822265625, + "learning_rate": 9.76978399558435e-07, + "loss": 0.017, + "reward": 1.1138393580913544, + "reward_std": 0.11709905974566936, + "rewards/accuracy_reward": 0.13169643841683865, + "rewards/format_reward": 0.9821429252624512, + "step": 643 + }, + { + "completion_length": 1381.4486999511719, + "epoch": 0.19236800836382645, + "grad_norm": 0.6449251174926758, + "kl": 0.30615234375, + "learning_rate": 9.768299642908808e-07, + "loss": 0.0247, + "reward": 1.082589328289032, + "reward_std": 0.21509434282779694, + "rewards/accuracy_reward": 0.12723215110599995, + "rewards/format_reward": 0.9553571939468384, + "step": 644 + }, + { + "completion_length": 1458.4822082519531, + "epoch": 0.19266671645134792, + "grad_norm": 1.0814958810806274, + "kl": 0.32177734375, + "learning_rate": 9.766810646741595e-07, + "loss": 0.0456, + "reward": 1.0937500298023224, + "reward_std": 0.24606654793024063, + "rewards/accuracy_reward": 0.14732143515720963, + "rewards/format_reward": 0.9464286118745804, + "step": 645 + }, + { + "completion_length": 1433.0826416015625, + "epoch": 0.1929654245388694, + "grad_norm": 0.43246808648109436, + "kl": 0.29052734375, + "learning_rate": 9.76531700870259e-07, + "loss": 0.0191, + "reward": 1.1674107611179352, + "reward_std": 0.2252841778099537, + "rewards/accuracy_reward": 0.2075893022119999, + "rewards/format_reward": 0.9598214775323868, + "step": 646 + }, + { + "completion_length": 1399.8594360351562, + "epoch": 0.19326413262639086, + "grad_norm": 0.5529734492301941, + "kl": 0.33544921875, + "learning_rate": 9.763818730416724e-07, + "loss": 0.016, + "reward": 1.0424107611179352, + "reward_std": 0.1958207879215479, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.957589328289032, + "step": 647 + }, + { + "completion_length": 1379.7411193847656, + "epoch": 0.19356284071391233, + "grad_norm": 0.7161087393760681, + "kl": 0.26123046875, + "learning_rate": 9.76231581351398e-07, + "loss": 0.0316, + "reward": 1.1294643580913544, + "reward_std": 0.14968914724886417, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9821428954601288, + "step": 648 + }, + { + "completion_length": 1502.1139526367188, + "epoch": 0.1938615488014338, + "grad_norm": 0.4280702769756317, + "kl": 0.291748046875, + "learning_rate": 9.760808259629383e-07, + "loss": 0.0177, + "reward": 1.1473214626312256, + "reward_std": 0.206262469291687, + "rewards/accuracy_reward": 0.1741071455180645, + "rewards/format_reward": 0.973214328289032, + "step": 649 + }, + { + "completion_length": 1429.6451416015625, + "epoch": 0.19416025688895527, + "grad_norm": 0.4773162007331848, + "kl": 0.30126953125, + "learning_rate": 9.759296070403001e-07, + "loss": 0.0512, + "reward": 1.1450893133878708, + "reward_std": 0.17421112023293972, + "rewards/accuracy_reward": 0.1763392877765, + "rewards/format_reward": 0.9687500447034836, + "step": 650 + }, + { + "completion_length": 1495.9822387695312, + "epoch": 0.19445896497647674, + "grad_norm": 0.5269002914428711, + "kl": 0.40185546875, + "learning_rate": 9.757779247479953e-07, + "loss": 0.044, + "reward": 1.1026786267757416, + "reward_std": 0.22883527353405952, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.957589328289032, + "step": 651 + }, + { + "completion_length": 1435.1942749023438, + "epoch": 0.19475767306399822, + "grad_norm": 0.637759268283844, + "kl": 0.334228515625, + "learning_rate": 9.75625779251039e-07, + "loss": 0.029, + "reward": 1.0736607313156128, + "reward_std": 0.20163467526435852, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.9620536267757416, + "step": 652 + }, + { + "completion_length": 1455.7545471191406, + "epoch": 0.1950563811515197, + "grad_norm": 0.5906383395195007, + "kl": 0.250244140625, + "learning_rate": 9.754731707149508e-07, + "loss": 0.014, + "reward": 1.1093750298023224, + "reward_std": 0.18001266196370125, + "rewards/accuracy_reward": 0.1361607164144516, + "rewards/format_reward": 0.9732143133878708, + "step": 653 + }, + { + "completion_length": 1444.5848999023438, + "epoch": 0.19535508923904116, + "grad_norm": 0.8268594741821289, + "kl": 0.263427734375, + "learning_rate": 9.753200993057534e-07, + "loss": 0.037, + "reward": 1.0334821939468384, + "reward_std": 0.22238649427890778, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.9575893133878708, + "step": 654 + }, + { + "completion_length": 1393.2143249511719, + "epoch": 0.19565379732656263, + "grad_norm": 0.43138736486434937, + "kl": 0.30908203125, + "learning_rate": 9.751665651899742e-07, + "loss": 0.019, + "reward": 1.0066964775323868, + "reward_std": 0.14027886558324099, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.9754464775323868, + "step": 655 + }, + { + "completion_length": 1415.716552734375, + "epoch": 0.1959525054140841, + "grad_norm": 0.34911856055259705, + "kl": 0.334716796875, + "learning_rate": 9.750125685346426e-07, + "loss": 0.0436, + "reward": 1.07589291036129, + "reward_std": 0.19575418531894684, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.9553571939468384, + "step": 656 + }, + { + "completion_length": 1400.4442443847656, + "epoch": 0.19625121350160554, + "grad_norm": 0.5464935898780823, + "kl": 0.24658203125, + "learning_rate": 9.748581095072922e-07, + "loss": 0.0085, + "reward": 1.131696492433548, + "reward_std": 0.18657976761460304, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9843750447034836, + "step": 657 + }, + { + "completion_length": 1433.8527526855469, + "epoch": 0.196549921589127, + "grad_norm": 0.8025379776954651, + "kl": 0.318359375, + "learning_rate": 9.747031882759594e-07, + "loss": 0.0307, + "reward": 1.058035746216774, + "reward_std": 0.13653122447431087, + "rewards/accuracy_reward": 0.08705357671715319, + "rewards/format_reward": 0.9709821790456772, + "step": 658 + }, + { + "completion_length": 1351.1741638183594, + "epoch": 0.19684862967664848, + "grad_norm": 0.32972270250320435, + "kl": 0.237548828125, + "learning_rate": 9.74547805009183e-07, + "loss": 0.0296, + "reward": 1.0892857313156128, + "reward_std": 0.10725559387356043, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9799107611179352, + "step": 659 + }, + { + "completion_length": 1398.1719665527344, + "epoch": 0.19714733776416996, + "grad_norm": 0.40788185596466064, + "kl": 0.247802734375, + "learning_rate": 9.74391959876005e-07, + "loss": 0.0328, + "reward": 1.0825893431901932, + "reward_std": 0.13719242066144943, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9843750447034836, + "step": 660 + }, + { + "completion_length": 1414.38623046875, + "epoch": 0.19744604585169143, + "grad_norm": 0.40811091661453247, + "kl": 0.1763916015625, + "learning_rate": 9.742356530459693e-07, + "loss": 0.0407, + "reward": 1.267857164144516, + "reward_std": 0.1850689873099327, + "rewards/accuracy_reward": 0.2879464440047741, + "rewards/format_reward": 0.9799107611179352, + "step": 661 + }, + { + "completion_length": 1415.8795166015625, + "epoch": 0.1977447539392129, + "grad_norm": 0.35666874051094055, + "kl": 0.187255859375, + "learning_rate": 9.74078884689123e-07, + "loss": 0.0408, + "reward": 1.0937500894069672, + "reward_std": 0.1634386833757162, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.9910714626312256, + "step": 662 + }, + { + "completion_length": 1514.9755249023438, + "epoch": 0.19804346202673437, + "grad_norm": 0.5733624696731567, + "kl": 0.1893310546875, + "learning_rate": 9.73921654976014e-07, + "loss": 0.0292, + "reward": 1.051339328289032, + "reward_std": 0.14151668176054955, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.9821428805589676, + "step": 663 + }, + { + "completion_length": 1345.1473693847656, + "epoch": 0.19834217011425584, + "grad_norm": 0.41925185918807983, + "kl": 0.2135009765625, + "learning_rate": 9.737639640776933e-07, + "loss": -0.017, + "reward": 1.0491071939468384, + "reward_std": 0.14310216903686523, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.9732143133878708, + "step": 664 + }, + { + "completion_length": 1345.1183471679688, + "epoch": 0.1986408782017773, + "grad_norm": 0.2998250126838684, + "kl": 0.1427001953125, + "learning_rate": 9.73605812165713e-07, + "loss": 0.0068, + "reward": 1.0937500298023224, + "reward_std": 0.14412999339401722, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9910714626312256, + "step": 665 + }, + { + "completion_length": 1296.7589721679688, + "epoch": 0.19893958628929878, + "grad_norm": 0.49948763847351074, + "kl": 0.1298828125, + "learning_rate": 9.73447199412127e-07, + "loss": 0.0661, + "reward": 1.1852678954601288, + "reward_std": 0.22465557605028152, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9732143431901932, + "step": 666 + }, + { + "completion_length": 1415.6898193359375, + "epoch": 0.19923829437682025, + "grad_norm": 0.5931330323219299, + "kl": 0.135009765625, + "learning_rate": 9.732881259894902e-07, + "loss": 0.0307, + "reward": 1.0334821939468384, + "reward_std": 0.1366529632359743, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.979910746216774, + "step": 667 + }, + { + "completion_length": 1311.2947082519531, + "epoch": 0.19953700246434172, + "grad_norm": 0.6752407550811768, + "kl": 0.177490234375, + "learning_rate": 9.73128592070859e-07, + "loss": 0.0506, + "reward": 1.2187500298023224, + "reward_std": 0.19793901033699512, + "rewards/accuracy_reward": 0.2455357238650322, + "rewards/format_reward": 0.9732143133878708, + "step": 668 + }, + { + "completion_length": 1346.6853332519531, + "epoch": 0.1998357105518632, + "grad_norm": 0.4761853814125061, + "kl": 0.1904296875, + "learning_rate": 9.729685978297907e-07, + "loss": 0.0171, + "reward": 1.1919643580913544, + "reward_std": 0.16058603022247553, + "rewards/accuracy_reward": 0.20982143585570157, + "rewards/format_reward": 0.9821428954601288, + "step": 669 + }, + { + "completion_length": 1229.9018249511719, + "epoch": 0.20013441863938466, + "grad_norm": 0.33615192770957947, + "kl": 0.224853515625, + "learning_rate": 9.728081434403437e-07, + "loss": 0.0339, + "reward": 1.0803571939468384, + "reward_std": 0.13986695278435946, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.9843750447034836, + "step": 670 + }, + { + "completion_length": 1299.2857666015625, + "epoch": 0.20043312672690614, + "grad_norm": 0.5702298879623413, + "kl": 0.2373046875, + "learning_rate": 9.726472290770759e-07, + "loss": 0.0362, + "reward": 1.1093750596046448, + "reward_std": 0.15218563936650753, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.9754464775323868, + "step": 671 + }, + { + "completion_length": 1261.5268859863281, + "epoch": 0.2007318348144276, + "grad_norm": 0.35207465291023254, + "kl": 0.188232421875, + "learning_rate": 9.724858549150467e-07, + "loss": 0.0102, + "reward": 1.1584821939468384, + "reward_std": 0.1273135431110859, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9910714477300644, + "step": 672 + }, + { + "completion_length": 1280.575927734375, + "epoch": 0.20103054290194908, + "grad_norm": 0.3622780740261078, + "kl": 0.278076171875, + "learning_rate": 9.723240211298156e-07, + "loss": 0.0545, + "reward": 1.113839328289032, + "reward_std": 0.22411945462226868, + "rewards/accuracy_reward": 0.15178571827709675, + "rewards/format_reward": 0.9620536267757416, + "step": 673 + }, + { + "completion_length": 1119.5647888183594, + "epoch": 0.20132925098947055, + "grad_norm": 0.4669269323348999, + "kl": 0.16259765625, + "learning_rate": 9.721617278974417e-07, + "loss": -0.0096, + "reward": 1.147321492433548, + "reward_std": 0.14639472216367722, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.9888393133878708, + "step": 674 + }, + { + "completion_length": 1258.4665832519531, + "epoch": 0.20162795907699202, + "grad_norm": 0.30494555830955505, + "kl": 0.1822509765625, + "learning_rate": 9.719989753944842e-07, + "loss": 0.0019, + "reward": 1.0535714626312256, + "reward_std": 0.16777766961604357, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.9821428954601288, + "step": 675 + }, + { + "completion_length": 1193.9397888183594, + "epoch": 0.2019266671645135, + "grad_norm": 0.41471701860427856, + "kl": 0.230712890625, + "learning_rate": 9.718357637980016e-07, + "loss": 0.0064, + "reward": 1.1361607760190964, + "reward_std": 0.23803110793232918, + "rewards/accuracy_reward": 0.16294643376022577, + "rewards/format_reward": 0.973214328289032, + "step": 676 + }, + { + "completion_length": 1131.1183471679688, + "epoch": 0.20222537525203496, + "grad_norm": 0.7453064918518066, + "kl": 0.262451171875, + "learning_rate": 9.716720932855526e-07, + "loss": 0.064, + "reward": 1.1183035969734192, + "reward_std": 0.23222136311233044, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.9687500149011612, + "step": 677 + }, + { + "completion_length": 1137.1138916015625, + "epoch": 0.20252408333955643, + "grad_norm": 0.5402852296829224, + "kl": 0.1746826171875, + "learning_rate": 9.715079640351942e-07, + "loss": 0.0198, + "reward": 1.1227678954601288, + "reward_std": 0.12923245877027512, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9709821790456772, + "step": 678 + }, + { + "completion_length": 1154.3482666015625, + "epoch": 0.20282279142707788, + "grad_norm": 0.5095847845077515, + "kl": 0.2255859375, + "learning_rate": 9.713433762254833e-07, + "loss": -0.0059, + "reward": 1.1026786267757416, + "reward_std": 0.22524181380867958, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9709821939468384, + "step": 679 + }, + { + "completion_length": 1312.669677734375, + "epoch": 0.20312149951459935, + "grad_norm": 0.37477266788482666, + "kl": 0.2783203125, + "learning_rate": 9.711783300354749e-07, + "loss": 0.0598, + "reward": 1.1897321939468384, + "reward_std": 0.23433616757392883, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.973214328289032, + "step": 680 + }, + { + "completion_length": 1170.9866638183594, + "epoch": 0.20342020760212082, + "grad_norm": 0.39825305342674255, + "kl": 0.249755859375, + "learning_rate": 9.710128256447235e-07, + "loss": 0.0146, + "reward": 1.1450893580913544, + "reward_std": 0.1959129236638546, + "rewards/accuracy_reward": 0.1830357164144516, + "rewards/format_reward": 0.9620536267757416, + "step": 681 + }, + { + "completion_length": 1157.654052734375, + "epoch": 0.2037189156896423, + "grad_norm": 0.47639045119285583, + "kl": 0.2724609375, + "learning_rate": 9.708468632332817e-07, + "loss": 0.061, + "reward": 1.1316964626312256, + "reward_std": 0.2173631601035595, + "rewards/accuracy_reward": 0.17410714738070965, + "rewards/format_reward": 0.957589328289032, + "step": 682 + }, + { + "completion_length": 1164.2121276855469, + "epoch": 0.20401762377716376, + "grad_norm": 0.8722822070121765, + "kl": 0.33349609375, + "learning_rate": 9.706804429816998e-07, + "loss": 0.0282, + "reward": 1.1160714626312256, + "reward_std": 0.13137936405837536, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9754464775323868, + "step": 683 + }, + { + "completion_length": 1210.7879943847656, + "epoch": 0.20431633186468523, + "grad_norm": 0.9933639168739319, + "kl": 0.285888671875, + "learning_rate": 9.705135650710271e-07, + "loss": -0.0059, + "reward": 1.1316964626312256, + "reward_std": 0.1953103132545948, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9709821790456772, + "step": 684 + }, + { + "completion_length": 1121.6138610839844, + "epoch": 0.2046150399522067, + "grad_norm": 0.4797816276550293, + "kl": 0.254150390625, + "learning_rate": 9.703462296828106e-07, + "loss": 0.0084, + "reward": 1.100446492433548, + "reward_std": 0.23829663172364235, + "rewards/accuracy_reward": 0.14062500488944352, + "rewards/format_reward": 0.9598214626312256, + "step": 685 + }, + { + "completion_length": 1182.6138916015625, + "epoch": 0.20491374803972817, + "grad_norm": 0.34714198112487793, + "kl": 0.254150390625, + "learning_rate": 9.701784369990944e-07, + "loss": 0.0169, + "reward": 1.1562500298023224, + "reward_std": 0.20262368395924568, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9821428954601288, + "step": 686 + }, + { + "completion_length": 1188.0179138183594, + "epoch": 0.20521245612724964, + "grad_norm": 0.52170729637146, + "kl": 0.25048828125, + "learning_rate": 9.700101872024206e-07, + "loss": 0.0345, + "reward": 1.0848214626312256, + "reward_std": 0.21394951082766056, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.973214328289032, + "step": 687 + }, + { + "completion_length": 1123.4085235595703, + "epoch": 0.2055111642147711, + "grad_norm": 0.6311262845993042, + "kl": 0.24169921875, + "learning_rate": 9.698414804758287e-07, + "loss": 0.0453, + "reward": 1.1562500596046448, + "reward_std": 0.18179473653435707, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9754464775323868, + "step": 688 + }, + { + "completion_length": 1250.3817138671875, + "epoch": 0.20580987230229258, + "grad_norm": 0.8250027894973755, + "kl": 0.29296875, + "learning_rate": 9.69672317002855e-07, + "loss": 0.044, + "reward": 1.1250000298023224, + "reward_std": 0.18686655908823013, + "rewards/accuracy_reward": 0.14732143515720963, + "rewards/format_reward": 0.9776785969734192, + "step": 689 + }, + { + "completion_length": 1102.5290832519531, + "epoch": 0.20610858038981406, + "grad_norm": 0.5484457015991211, + "kl": 0.2490234375, + "learning_rate": 9.695026969675323e-07, + "loss": 0.0118, + "reward": 1.129464328289032, + "reward_std": 0.21187419444322586, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.973214328289032, + "step": 690 + }, + { + "completion_length": 1298.2098999023438, + "epoch": 0.20640728847733553, + "grad_norm": 0.8792422413825989, + "kl": 0.2880859375, + "learning_rate": 9.693326205543913e-07, + "loss": 0.0532, + "reward": 1.1629464775323868, + "reward_std": 0.22049614414572716, + "rewards/accuracy_reward": 0.2142857313156128, + "rewards/format_reward": 0.9486607611179352, + "step": 691 + }, + { + "completion_length": 1097.7277221679688, + "epoch": 0.206705996564857, + "grad_norm": 0.3795260190963745, + "kl": 0.23974609375, + "learning_rate": 9.691620879484581e-07, + "loss": 0.0058, + "reward": 1.0424107611179352, + "reward_std": 0.1347555024549365, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.9776786118745804, + "step": 692 + }, + { + "completion_length": 1122.2254943847656, + "epoch": 0.20700470465237847, + "grad_norm": 0.5092126131057739, + "kl": 0.23779296875, + "learning_rate": 9.689910993352554e-07, + "loss": 0.0322, + "reward": 1.1473214626312256, + "reward_std": 0.21466820687055588, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.9687500298023224, + "step": 693 + }, + { + "completion_length": 1230.6741638183594, + "epoch": 0.20730341273989994, + "grad_norm": 0.42483794689178467, + "kl": 0.31982421875, + "learning_rate": 9.688196549008023e-07, + "loss": 0.0596, + "reward": 1.104910746216774, + "reward_std": 0.19742115773260593, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9598214626312256, + "step": 694 + }, + { + "completion_length": 1081.0045318603516, + "epoch": 0.2076021208274214, + "grad_norm": 0.4799787402153015, + "kl": 0.2666015625, + "learning_rate": 9.686477548316135e-07, + "loss": 0.0372, + "reward": 1.1607143431901932, + "reward_std": 0.1689788829535246, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9642857611179352, + "step": 695 + }, + { + "completion_length": 1152.7745971679688, + "epoch": 0.20790082891494288, + "grad_norm": 0.6201736927032471, + "kl": 0.31640625, + "learning_rate": 9.684753993146991e-07, + "loss": 0.0356, + "reward": 1.0736607611179352, + "reward_std": 0.2417495995759964, + "rewards/accuracy_reward": 0.10937500465661287, + "rewards/format_reward": 0.9642857760190964, + "step": 696 + }, + { + "completion_length": 1116.5960388183594, + "epoch": 0.20819953700246435, + "grad_norm": 0.7709305882453918, + "kl": 0.3218994140625, + "learning_rate": 9.683025885375654e-07, + "loss": 0.0322, + "reward": 1.1071428954601288, + "reward_std": 0.24924412742257118, + "rewards/accuracy_reward": 0.15625000465661287, + "rewards/format_reward": 0.9508928805589676, + "step": 697 + }, + { + "completion_length": 1204.6786499023438, + "epoch": 0.20849824508998582, + "grad_norm": 0.7005147933959961, + "kl": 0.37060546875, + "learning_rate": 9.681293226882134e-07, + "loss": 0.0253, + "reward": 1.0558036118745804, + "reward_std": 0.2184532769024372, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.9531250298023224, + "step": 698 + }, + { + "completion_length": 1108.6964721679688, + "epoch": 0.2087969531775073, + "grad_norm": 0.5732868909835815, + "kl": 0.29052734375, + "learning_rate": 9.679556019551392e-07, + "loss": 0.0424, + "reward": 1.095982164144516, + "reward_std": 0.2779364660382271, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9508928954601288, + "step": 699 + }, + { + "completion_length": 1222.4598693847656, + "epoch": 0.20909566126502874, + "grad_norm": 0.4809533357620239, + "kl": 0.40478515625, + "learning_rate": 9.677814265273344e-07, + "loss": 0.0662, + "reward": 1.0535714775323868, + "reward_std": 0.18855388462543488, + "rewards/accuracy_reward": 0.09151785937137902, + "rewards/format_reward": 0.9620535969734192, + "step": 700 + }, + { + "completion_length": 1065.3214874267578, + "epoch": 0.2093943693525502, + "grad_norm": 0.7767879366874695, + "kl": 0.287353515625, + "learning_rate": 9.676067965942844e-07, + "loss": 0.0501, + "reward": 1.147321492433548, + "reward_std": 0.21325494721531868, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9620536118745804, + "step": 701 + }, + { + "completion_length": 1165.575927734375, + "epoch": 0.20969307744007168, + "grad_norm": 0.8032437562942505, + "kl": 0.34619140625, + "learning_rate": 9.674317123459696e-07, + "loss": 0.0125, + "reward": 1.0178571939468384, + "reward_std": 0.16643687151372433, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.9665178954601288, + "step": 702 + }, + { + "completion_length": 1163.227783203125, + "epoch": 0.20999178552759315, + "grad_norm": 1.0926896333694458, + "kl": 0.345703125, + "learning_rate": 9.672561739728643e-07, + "loss": 0.0758, + "reward": 1.1272321939468384, + "reward_std": 0.2826094776391983, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9620535969734192, + "step": 703 + }, + { + "completion_length": 1104.857177734375, + "epoch": 0.21029049361511462, + "grad_norm": 0.41994842886924744, + "kl": 0.37646484375, + "learning_rate": 9.670801816659375e-07, + "loss": 0.0342, + "reward": 1.0379464775323868, + "reward_std": 0.19245322234928608, + "rewards/accuracy_reward": 0.08035714458674192, + "rewards/format_reward": 0.957589328289032, + "step": 704 + }, + { + "completion_length": 1229.0625915527344, + "epoch": 0.2105892017026361, + "grad_norm": 1.0494149923324585, + "kl": 0.6103515625, + "learning_rate": 9.669037356166511e-07, + "loss": 0.0564, + "reward": 1.0156250596046448, + "reward_std": 0.31731636077165604, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.9352678954601288, + "step": 705 + }, + { + "completion_length": 1193.5603332519531, + "epoch": 0.21088790979015756, + "grad_norm": 0.5039858818054199, + "kl": 0.41845703125, + "learning_rate": 9.667268360169616e-07, + "loss": 0.0333, + "reward": 1.1741071939468384, + "reward_std": 0.24085534363985062, + "rewards/accuracy_reward": 0.20535715483129025, + "rewards/format_reward": 0.9687500298023224, + "step": 706 + }, + { + "completion_length": 1187.5067138671875, + "epoch": 0.21118661787767903, + "grad_norm": 0.8673931360244751, + "kl": 0.416015625, + "learning_rate": 9.665494830593177e-07, + "loss": 0.031, + "reward": 0.973214328289032, + "reward_std": 0.11368322186172009, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.9709821790456772, + "step": 707 + }, + { + "completion_length": 1169.5670471191406, + "epoch": 0.2114853259652005, + "grad_norm": 0.6827961206436157, + "kl": 0.404296875, + "learning_rate": 9.663716769366627e-07, + "loss": 0.0645, + "reward": 1.1473214626312256, + "reward_std": 0.29358232393860817, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9531250447034836, + "step": 708 + }, + { + "completion_length": 1161.8281860351562, + "epoch": 0.21178403405272198, + "grad_norm": 1.0160317420959473, + "kl": 0.40234375, + "learning_rate": 9.66193417842432e-07, + "loss": 0.0333, + "reward": 1.1495536118745804, + "reward_std": 0.26594989746809006, + "rewards/accuracy_reward": 0.18750000838190317, + "rewards/format_reward": 0.9620536267757416, + "step": 709 + }, + { + "completion_length": 1226.1607666015625, + "epoch": 0.21208274214024345, + "grad_norm": 0.9042441248893738, + "kl": 0.396484375, + "learning_rate": 9.66014705970554e-07, + "loss": 0.026, + "reward": 1.1205357611179352, + "reward_std": 0.20471647009253502, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.964285746216774, + "step": 710 + }, + { + "completion_length": 1139.26123046875, + "epoch": 0.21238145022776492, + "grad_norm": 0.3679109215736389, + "kl": 0.223388671875, + "learning_rate": 9.658355415154498e-07, + "loss": 0.0276, + "reward": 1.0892857611179352, + "reward_std": 0.23130539059638977, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 711 + }, + { + "completion_length": 1106.2344360351562, + "epoch": 0.2126801583152864, + "grad_norm": 0.724004864692688, + "kl": 0.21240234375, + "learning_rate": 9.656559246720327e-07, + "loss": 0.0444, + "reward": 1.158482164144516, + "reward_std": 0.18073931336402893, + "rewards/accuracy_reward": 0.1852678705472499, + "rewards/format_reward": 0.9732143133878708, + "step": 712 + }, + { + "completion_length": 1214.232177734375, + "epoch": 0.21297886640280786, + "grad_norm": 1.8972872495651245, + "kl": 0.376953125, + "learning_rate": 9.65475855635708e-07, + "loss": 0.0806, + "reward": 1.1071429252624512, + "reward_std": 0.22319333627820015, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9441964626312256, + "step": 713 + }, + { + "completion_length": 1265.966552734375, + "epoch": 0.21327757449032933, + "grad_norm": 0.40284276008605957, + "kl": 0.308349609375, + "learning_rate": 9.652953346023737e-07, + "loss": 0.0213, + "reward": 1.0468750596046448, + "reward_std": 0.223524060100317, + "rewards/accuracy_reward": 0.08928571594879031, + "rewards/format_reward": 0.957589328289032, + "step": 714 + }, + { + "completion_length": 1242.9330749511719, + "epoch": 0.2135762825778508, + "grad_norm": 0.6260563731193542, + "kl": 0.229736328125, + "learning_rate": 9.651143617684185e-07, + "loss": 0.0712, + "reward": 1.082589328289032, + "reward_std": 0.21610532328486443, + "rewards/accuracy_reward": 0.11830358067527413, + "rewards/format_reward": 0.964285746216774, + "step": 715 + }, + { + "completion_length": 1215.6250610351562, + "epoch": 0.21387499066537227, + "grad_norm": 0.46311137080192566, + "kl": 0.273193359375, + "learning_rate": 9.649329373307232e-07, + "loss": 0.0204, + "reward": 1.1138393133878708, + "reward_std": 0.1840495467185974, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9620536118745804, + "step": 716 + }, + { + "completion_length": 1178.1072082519531, + "epoch": 0.21417369875289374, + "grad_norm": 0.5690425634384155, + "kl": 0.28564453125, + "learning_rate": 9.6475106148666e-07, + "loss": 0.0441, + "reward": 1.082589328289032, + "reward_std": 0.1677497737109661, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.973214328289032, + "step": 717 + }, + { + "completion_length": 1050.1094360351562, + "epoch": 0.21447240684041521, + "grad_norm": 0.3618701696395874, + "kl": 0.250732421875, + "learning_rate": 9.645687344340918e-07, + "loss": 0.0347, + "reward": 1.1138393580913544, + "reward_std": 0.2511085942387581, + "rewards/accuracy_reward": 0.15178572479635477, + "rewards/format_reward": 0.9620536118745804, + "step": 718 + }, + { + "completion_length": 1180.2879943847656, + "epoch": 0.21477111492793668, + "grad_norm": 0.2981600761413574, + "kl": 0.32958984375, + "learning_rate": 9.643859563713726e-07, + "loss": 0.0144, + "reward": 1.082589328289032, + "reward_std": 0.20353077352046967, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9598214775323868, + "step": 719 + }, + { + "completion_length": 1172.0781860351562, + "epoch": 0.21506982301545816, + "grad_norm": 0.5722825527191162, + "kl": 0.26953125, + "learning_rate": 9.64202727497347e-07, + "loss": 0.0378, + "reward": 1.0580357611179352, + "reward_std": 0.1829272210597992, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.9776785969734192, + "step": 720 + }, + { + "completion_length": 1203.8616638183594, + "epoch": 0.21536853110297963, + "grad_norm": 0.44905519485473633, + "kl": 0.394775390625, + "learning_rate": 9.640190480113503e-07, + "loss": 0.0249, + "reward": 1.0669643133878708, + "reward_std": 0.172220129519701, + "rewards/accuracy_reward": 0.11383928940631449, + "rewards/format_reward": 0.9531250447034836, + "step": 721 + }, + { + "completion_length": 1208.5558471679688, + "epoch": 0.21566723919050107, + "grad_norm": 0.476447194814682, + "kl": 0.3662109375, + "learning_rate": 9.638349181132077e-07, + "loss": 0.0122, + "reward": 1.0468750298023224, + "reward_std": 0.1942717619240284, + "rewards/accuracy_reward": 0.08705357275903225, + "rewards/format_reward": 0.9598214626312256, + "step": 722 + }, + { + "completion_length": 1190.8750610351562, + "epoch": 0.21596594727802254, + "grad_norm": 0.42390692234039307, + "kl": 0.291015625, + "learning_rate": 9.63650338003234e-07, + "loss": 0.0454, + "reward": 1.1696429252624512, + "reward_std": 0.21702659502625465, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9687500298023224, + "step": 723 + }, + { + "completion_length": 1140.7411499023438, + "epoch": 0.216264655365544, + "grad_norm": 0.9799686074256897, + "kl": 0.38720703125, + "learning_rate": 9.634653078822348e-07, + "loss": 0.0169, + "reward": 1.1607143580913544, + "reward_std": 0.21985067427158356, + "rewards/accuracy_reward": 0.18080358393490314, + "rewards/format_reward": 0.979910746216774, + "step": 724 + }, + { + "completion_length": 1147.6920166015625, + "epoch": 0.21656336345306548, + "grad_norm": 0.42583614587783813, + "kl": 0.28662109375, + "learning_rate": 9.632798279515047e-07, + "loss": 0.014, + "reward": 1.0714286267757416, + "reward_std": 0.15830791927874088, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9665179252624512, + "step": 725 + }, + { + "completion_length": 1177.5692749023438, + "epoch": 0.21686207154058695, + "grad_norm": 0.41759952902793884, + "kl": 0.32421875, + "learning_rate": 9.630938984128276e-07, + "loss": 0.0334, + "reward": 1.116071492433548, + "reward_std": 0.1837641093879938, + "rewards/accuracy_reward": 0.14955358067527413, + "rewards/format_reward": 0.9665178954601288, + "step": 726 + }, + { + "completion_length": 1157.1518096923828, + "epoch": 0.21716077962810842, + "grad_norm": 0.41637787222862244, + "kl": 0.323486328125, + "learning_rate": 9.629075194684763e-07, + "loss": 0.0529, + "reward": 1.0892857313156128, + "reward_std": 0.2204374447464943, + "rewards/accuracy_reward": 0.12723215110599995, + "rewards/format_reward": 0.9620536118745804, + "step": 727 + }, + { + "completion_length": 1147.8995971679688, + "epoch": 0.2174594877156299, + "grad_norm": 0.5764189958572388, + "kl": 0.311279296875, + "learning_rate": 9.627206913212134e-07, + "loss": 0.0264, + "reward": 1.0937500298023224, + "reward_std": 0.17879304755479097, + "rewards/accuracy_reward": 0.11607143026776612, + "rewards/format_reward": 0.977678582072258, + "step": 728 + }, + { + "completion_length": 1149.4576416015625, + "epoch": 0.21775819580315137, + "grad_norm": 0.7647913098335266, + "kl": 0.287353515625, + "learning_rate": 9.62533414174289e-07, + "loss": 0.0177, + "reward": 1.1651786267757416, + "reward_std": 0.24118343368172646, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.96651791036129, + "step": 729 + }, + { + "completion_length": 1148.013427734375, + "epoch": 0.21805690389067284, + "grad_norm": 0.9369654059410095, + "kl": 0.30712890625, + "learning_rate": 9.62345688231443e-07, + "loss": 0.0388, + "reward": 1.113839328289032, + "reward_std": 0.27449931204319, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9441964626312256, + "step": 730 + }, + { + "completion_length": 1085.651840209961, + "epoch": 0.2183556119781943, + "grad_norm": 0.35036274790763855, + "kl": 0.226318359375, + "learning_rate": 9.621575136969023e-07, + "loss": 0.014, + "reward": 1.04464291036129, + "reward_std": 0.1721072606742382, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.9709821790456772, + "step": 731 + }, + { + "completion_length": 1066.6741638183594, + "epoch": 0.21865432006571578, + "grad_norm": 0.4994291067123413, + "kl": 0.2607421875, + "learning_rate": 9.619688907753823e-07, + "loss": 0.0412, + "reward": 1.1473214626312256, + "reward_std": 0.18667588010430336, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9776786118745804, + "step": 732 + }, + { + "completion_length": 1090.1272735595703, + "epoch": 0.21895302815323725, + "grad_norm": 0.6877834796905518, + "kl": 0.282470703125, + "learning_rate": 9.617798196720866e-07, + "loss": 0.0484, + "reward": 1.162946492433548, + "reward_std": 0.2226906456053257, + "rewards/accuracy_reward": 0.1986607313156128, + "rewards/format_reward": 0.964285746216774, + "step": 733 + }, + { + "completion_length": 1092.5625610351562, + "epoch": 0.21925173624075872, + "grad_norm": 0.46793970465660095, + "kl": 0.27392578125, + "learning_rate": 9.615903005927056e-07, + "loss": 0.0305, + "reward": 1.1071429401636124, + "reward_std": 0.21622544713318348, + "rewards/accuracy_reward": 0.1361607164144516, + "rewards/format_reward": 0.9709821939468384, + "step": 734 + }, + { + "completion_length": 1158.9330749511719, + "epoch": 0.2195504443282802, + "grad_norm": 0.8050515651702881, + "kl": 0.38525390625, + "learning_rate": 9.61400333743418e-07, + "loss": 0.0288, + "reward": 1.127232164144516, + "reward_std": 0.223913736641407, + "rewards/accuracy_reward": 0.16964286309666932, + "rewards/format_reward": 0.957589328289032, + "step": 735 + }, + { + "completion_length": 1144.0915985107422, + "epoch": 0.21984915241580166, + "grad_norm": 0.6616784334182739, + "kl": 0.34765625, + "learning_rate": 9.612099193308889e-07, + "loss": 0.0287, + "reward": 1.0892857611179352, + "reward_std": 0.22108809277415276, + "rewards/accuracy_reward": 0.12053572200238705, + "rewards/format_reward": 0.9687500447034836, + "step": 736 + }, + { + "completion_length": 1139.8013916015625, + "epoch": 0.22014786050332313, + "grad_norm": 0.7102949023246765, + "kl": 0.40380859375, + "learning_rate": 9.610190575622702e-07, + "loss": 0.053, + "reward": 1.0870536267757416, + "reward_std": 0.19076182693243027, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9620536267757416, + "step": 737 + }, + { + "completion_length": 1114.0201110839844, + "epoch": 0.2204465685908446, + "grad_norm": 0.704920768737793, + "kl": 0.376953125, + "learning_rate": 9.608277486452011e-07, + "loss": 0.0361, + "reward": 1.0223214775323868, + "reward_std": 0.11773011554032564, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.9799107611179352, + "step": 738 + }, + { + "completion_length": 1115.1317596435547, + "epoch": 0.22074527667836608, + "grad_norm": 0.5396891832351685, + "kl": 0.4267578125, + "learning_rate": 9.606359927878072e-07, + "loss": 0.04, + "reward": 1.0781250298023224, + "reward_std": 0.20927650853991508, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.9598214626312256, + "step": 739 + }, + { + "completion_length": 1210.0312805175781, + "epoch": 0.22104398476588755, + "grad_norm": 0.44531556963920593, + "kl": 0.3095703125, + "learning_rate": 9.604437901986995e-07, + "loss": 0.0357, + "reward": 1.1495536267757416, + "reward_std": 0.15865902043879032, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.973214328289032, + "step": 740 + }, + { + "completion_length": 1121.5692749023438, + "epoch": 0.22134269285340902, + "grad_norm": 0.4618227183818817, + "kl": 0.3154296875, + "learning_rate": 9.60251141086976e-07, + "loss": 0.0515, + "reward": 1.100446492433548, + "reward_std": 0.21203702315688133, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9687500596046448, + "step": 741 + }, + { + "completion_length": 1154.5536193847656, + "epoch": 0.2216414009409305, + "grad_norm": 0.3410845696926117, + "kl": 0.3505859375, + "learning_rate": 9.6005804566222e-07, + "loss": 0.0579, + "reward": 1.0758929252624512, + "reward_std": 0.19229715317487717, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.9709821790456772, + "step": 742 + }, + { + "completion_length": 1210.607177734375, + "epoch": 0.22194010902845193, + "grad_norm": 0.4574369192123413, + "kl": 0.34619140625, + "learning_rate": 9.598645041345005e-07, + "loss": 0.0341, + "reward": 1.0401786267757416, + "reward_std": 0.24843565374612808, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.964285746216774, + "step": 743 + }, + { + "completion_length": 1141.5826416015625, + "epoch": 0.2222388171159734, + "grad_norm": 0.6873401403427124, + "kl": 0.34228515625, + "learning_rate": 9.596705167143712e-07, + "loss": 0.0734, + "reward": 1.2343750596046448, + "reward_std": 0.29631584510207176, + "rewards/accuracy_reward": 0.2767857275903225, + "rewards/format_reward": 0.957589328289032, + "step": 744 + }, + { + "completion_length": 1176.19873046875, + "epoch": 0.22253752520349487, + "grad_norm": 0.5372123718261719, + "kl": 0.28466796875, + "learning_rate": 9.594760836128718e-07, + "loss": 0.0447, + "reward": 1.0825893431901932, + "reward_std": 0.18991845659911633, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.973214328289032, + "step": 745 + }, + { + "completion_length": 1187.0402526855469, + "epoch": 0.22283623329101634, + "grad_norm": 0.5773569345474243, + "kl": 0.26416015625, + "learning_rate": 9.592812050415264e-07, + "loss": 0.0385, + "reward": 1.1004464626312256, + "reward_std": 0.22933094948530197, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9754464775323868, + "step": 746 + }, + { + "completion_length": 1283.9576416015625, + "epoch": 0.22313494137853782, + "grad_norm": 0.4756278693675995, + "kl": 0.3291015625, + "learning_rate": 9.590858812123437e-07, + "loss": 0.0229, + "reward": 1.1250000596046448, + "reward_std": 0.13294352032244205, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9687500447034836, + "step": 747 + }, + { + "completion_length": 1228.3304138183594, + "epoch": 0.2234336494660593, + "grad_norm": 0.75160813331604, + "kl": 0.3251953125, + "learning_rate": 9.588901123378172e-07, + "loss": 0.0193, + "reward": 1.0848214626312256, + "reward_std": 0.1862485073506832, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/format_reward": 0.9598214477300644, + "step": 748 + }, + { + "completion_length": 1228.0000610351562, + "epoch": 0.22373235755358076, + "grad_norm": 1.3209093809127808, + "kl": 0.3193359375, + "learning_rate": 9.58693898630924e-07, + "loss": 0.0499, + "reward": 1.0870536267757416, + "reward_std": 0.25024398788809776, + "rewards/accuracy_reward": 0.13839286123402417, + "rewards/format_reward": 0.9486607611179352, + "step": 749 + }, + { + "completion_length": 1164.388427734375, + "epoch": 0.22403106564110223, + "grad_norm": 1.9981886148452759, + "kl": 0.268798828125, + "learning_rate": 9.584972403051252e-07, + "loss": 0.0404, + "reward": 1.1562500596046448, + "reward_std": 0.29363639652729034, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9553571939468384, + "step": 750 + }, + { + "completion_length": 1256.2656555175781, + "epoch": 0.2243297737286237, + "grad_norm": 1.1755807399749756, + "kl": 0.3271484375, + "learning_rate": 9.58300137574366e-07, + "loss": 0.0587, + "reward": 1.1250000894069672, + "reward_std": 0.26231881976127625, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9642857611179352, + "step": 751 + }, + { + "completion_length": 1196.37060546875, + "epoch": 0.22462848181614517, + "grad_norm": 0.8170644640922546, + "kl": 0.3125, + "learning_rate": 9.581025906530752e-07, + "loss": 0.0307, + "reward": 1.0937500596046448, + "reward_std": 0.23870237171649933, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9553571790456772, + "step": 752 + }, + { + "completion_length": 1181.0692138671875, + "epoch": 0.22492718990366664, + "grad_norm": 0.597294807434082, + "kl": 0.389404296875, + "learning_rate": 9.579045997561637e-07, + "loss": 0.0385, + "reward": 1.1138393580913544, + "reward_std": 0.21907248347997665, + "rewards/accuracy_reward": 0.16517857555299997, + "rewards/format_reward": 0.9486607611179352, + "step": 753 + }, + { + "completion_length": 1234.3817749023438, + "epoch": 0.2252258979911881, + "grad_norm": 0.6490675806999207, + "kl": 0.3583984375, + "learning_rate": 9.577061650990266e-07, + "loss": 0.0323, + "reward": 1.147321492433548, + "reward_std": 0.258691631257534, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.964285746216774, + "step": 754 + }, + { + "completion_length": 1211.1004943847656, + "epoch": 0.22552460607870958, + "grad_norm": 1.0024709701538086, + "kl": 0.49755859375, + "learning_rate": 9.575072868975416e-07, + "loss": 0.0853, + "reward": 0.9933036118745804, + "reward_std": 0.24141618981957436, + "rewards/accuracy_reward": 0.058035718742758036, + "rewards/format_reward": 0.9352678954601288, + "step": 755 + }, + { + "completion_length": 1193.6094360351562, + "epoch": 0.22582331416623105, + "grad_norm": 7.178197860717773, + "kl": 0.556640625, + "learning_rate": 9.573079653680685e-07, + "loss": 0.0457, + "reward": 1.111607164144516, + "reward_std": 0.27741459757089615, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9441964775323868, + "step": 756 + }, + { + "completion_length": 1216.69873046875, + "epoch": 0.22612202225375252, + "grad_norm": 6.721797466278076, + "kl": 0.6201171875, + "learning_rate": 9.571082007274493e-07, + "loss": 0.0301, + "reward": 1.0758928954601288, + "reward_std": 0.2806642912328243, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.9196428954601288, + "step": 757 + }, + { + "completion_length": 1248.2478332519531, + "epoch": 0.226420730341274, + "grad_norm": 1.4763973951339722, + "kl": 0.61572265625, + "learning_rate": 9.56907993193009e-07, + "loss": 0.0709, + "reward": 1.0803571939468384, + "reward_std": 0.2967090830206871, + "rewards/accuracy_reward": 0.1540178693830967, + "rewards/format_reward": 0.926339328289032, + "step": 758 + }, + { + "completion_length": 1220.3460388183594, + "epoch": 0.22671943842879547, + "grad_norm": 1.3739585876464844, + "kl": 0.60498046875, + "learning_rate": 9.56707342982553e-07, + "loss": 0.0408, + "reward": 1.0022322088479996, + "reward_std": 0.2732303813099861, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.9375000447034836, + "step": 759 + }, + { + "completion_length": 1194.3929443359375, + "epoch": 0.22701814651631694, + "grad_norm": 1.6887761354446411, + "kl": 0.5107421875, + "learning_rate": 9.565062503143696e-07, + "loss": 0.0207, + "reward": 1.1250000447034836, + "reward_std": 0.22436760365962982, + "rewards/accuracy_reward": 0.17857143771834671, + "rewards/format_reward": 0.9464286118745804, + "step": 760 + }, + { + "completion_length": 1188.7545166015625, + "epoch": 0.2273168546038384, + "grad_norm": 1.1425880193710327, + "kl": 0.38037109375, + "learning_rate": 9.563047154072275e-07, + "loss": 0.0072, + "reward": 1.0156250298023224, + "reward_std": 0.14390334580093622, + "rewards/accuracy_reward": 0.04464286030270159, + "rewards/format_reward": 0.9709821939468384, + "step": 761 + }, + { + "completion_length": 1109.6741638183594, + "epoch": 0.22761556269135988, + "grad_norm": 0.8302162885665894, + "kl": 0.3212890625, + "learning_rate": 9.561027384803776e-07, + "loss": -0.0059, + "reward": 1.1250000298023224, + "reward_std": 0.1939203068614006, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.9508928954601288, + "step": 762 + }, + { + "completion_length": 1166.0246276855469, + "epoch": 0.22791427077888135, + "grad_norm": 0.7167302370071411, + "kl": 0.248046875, + "learning_rate": 9.559003197535502e-07, + "loss": 0.0151, + "reward": 1.1763393580913544, + "reward_std": 0.23332138173282146, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.9620536118745804, + "step": 763 + }, + { + "completion_length": 1139.700927734375, + "epoch": 0.22821297886640282, + "grad_norm": 0.753153383731842, + "kl": 0.277099609375, + "learning_rate": 9.556974594469577e-07, + "loss": 0.0257, + "reward": 1.1875000596046448, + "reward_std": 0.19524182379245758, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9754464477300644, + "step": 764 + }, + { + "completion_length": 1207.1920166015625, + "epoch": 0.22851168695392426, + "grad_norm": 0.30064094066619873, + "kl": 0.19775390625, + "learning_rate": 9.554941577812918e-07, + "loss": 0.0285, + "reward": 1.1093750596046448, + "reward_std": 0.15332257747650146, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.9821429252624512, + "step": 765 + }, + { + "completion_length": 1098.6942443847656, + "epoch": 0.22881039504144574, + "grad_norm": 0.6055102348327637, + "kl": 0.19921875, + "learning_rate": 9.55290414977725e-07, + "loss": 0.0337, + "reward": 1.2544643580913544, + "reward_std": 0.280720517039299, + "rewards/accuracy_reward": 0.2857142984867096, + "rewards/format_reward": 0.9687500447034836, + "step": 766 + }, + { + "completion_length": 1152.1830749511719, + "epoch": 0.2291091031289672, + "grad_norm": 0.6614567041397095, + "kl": 0.1483154296875, + "learning_rate": 9.550862312579094e-07, + "loss": 0.0321, + "reward": 1.1406250298023224, + "reward_std": 0.23068571835756302, + "rewards/accuracy_reward": 0.17187500861473382, + "rewards/format_reward": 0.9687500447034836, + "step": 767 + }, + { + "completion_length": 1226.9732666015625, + "epoch": 0.22940781121648868, + "grad_norm": 0.3057483434677124, + "kl": 0.1634521484375, + "learning_rate": 9.548816068439768e-07, + "loss": 0.0147, + "reward": 1.0982143580913544, + "reward_std": 0.1863052025437355, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9709821790456772, + "step": 768 + }, + { + "completion_length": 1249.2523193359375, + "epoch": 0.22970651930401015, + "grad_norm": 0.6082642078399658, + "kl": 0.2109375, + "learning_rate": 9.546765419585388e-07, + "loss": 0.0151, + "reward": 1.1875000596046448, + "reward_std": 0.1991983875632286, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9709821790456772, + "step": 769 + }, + { + "completion_length": 1188.2857666015625, + "epoch": 0.23000522739153162, + "grad_norm": 0.7384894490242004, + "kl": 0.193359375, + "learning_rate": 9.544710368246856e-07, + "loss": 0.0358, + "reward": 1.1250000596046448, + "reward_std": 0.20567161962389946, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.9642857760190964, + "step": 770 + }, + { + "completion_length": 1173.450927734375, + "epoch": 0.2303039354790531, + "grad_norm": 0.8992297053337097, + "kl": 0.1734619140625, + "learning_rate": 9.542650916659869e-07, + "loss": 0.0273, + "reward": 1.1183036267757416, + "reward_std": 0.17654634732753038, + "rewards/accuracy_reward": 0.14285715483129025, + "rewards/format_reward": 0.9754464477300644, + "step": 771 + }, + { + "completion_length": 1234.8705749511719, + "epoch": 0.23060264356657456, + "grad_norm": 1.6773887872695923, + "kl": 0.22705078125, + "learning_rate": 9.540587067064905e-07, + "loss": 0.0722, + "reward": 1.1093750596046448, + "reward_std": 0.28002748265862465, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9397321790456772, + "step": 772 + }, + { + "completion_length": 1156.7188110351562, + "epoch": 0.23090135165409603, + "grad_norm": 0.7083947658538818, + "kl": 0.222900390625, + "learning_rate": 9.538518821707231e-07, + "loss": 0.0278, + "reward": 1.1272321939468384, + "reward_std": 0.2520678676664829, + "rewards/accuracy_reward": 0.16517858393490314, + "rewards/format_reward": 0.9620536267757416, + "step": 773 + }, + { + "completion_length": 1133.2254943847656, + "epoch": 0.2312000597416175, + "grad_norm": 0.6425380706787109, + "kl": 0.252685546875, + "learning_rate": 9.5364461828369e-07, + "loss": -0.0091, + "reward": 1.0580357611179352, + "reward_std": 0.14062084071338177, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.9665178954601288, + "step": 774 + }, + { + "completion_length": 1142.1830749511719, + "epoch": 0.23149876782913897, + "grad_norm": 0.35425907373428345, + "kl": 0.288330078125, + "learning_rate": 9.534369152708735e-07, + "loss": 0.0072, + "reward": 1.2008929252624512, + "reward_std": 0.24430953338742256, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.964285746216774, + "step": 775 + }, + { + "completion_length": 1037.156265258789, + "epoch": 0.23179747591666044, + "grad_norm": 0.389037549495697, + "kl": 0.271728515625, + "learning_rate": 9.532287733582343e-07, + "loss": 0.0282, + "reward": 1.1049107611179352, + "reward_std": 0.21012501046061516, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.9754464477300644, + "step": 776 + }, + { + "completion_length": 1092.3527221679688, + "epoch": 0.23209618400418192, + "grad_norm": 0.670000433921814, + "kl": 0.283203125, + "learning_rate": 9.530201927722103e-07, + "loss": -0.0025, + "reward": 1.116071492433548, + "reward_std": 0.16766181401908398, + "rewards/accuracy_reward": 0.13392857508733869, + "rewards/format_reward": 0.9821428954601288, + "step": 777 + }, + { + "completion_length": 1081.9130096435547, + "epoch": 0.2323948920917034, + "grad_norm": 0.5657044649124146, + "kl": 0.231689453125, + "learning_rate": 9.528111737397167e-07, + "loss": 0.0194, + "reward": 1.1428571939468384, + "reward_std": 0.25453799962997437, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.9553571790456772, + "step": 778 + }, + { + "completion_length": 1150.5670166015625, + "epoch": 0.23269360017922486, + "grad_norm": 0.4647386074066162, + "kl": 0.283203125, + "learning_rate": 9.526017164881459e-07, + "loss": 0.0207, + "reward": 1.113839328289032, + "reward_std": 0.25564899668097496, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9531250447034836, + "step": 779 + }, + { + "completion_length": 1073.2857818603516, + "epoch": 0.23299230826674633, + "grad_norm": 0.46131283044815063, + "kl": 0.35400390625, + "learning_rate": 9.523918212453668e-07, + "loss": 0.0331, + "reward": 1.098214328289032, + "reward_std": 0.18377683870494366, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.9821428954601288, + "step": 780 + }, + { + "completion_length": 1053.9955749511719, + "epoch": 0.2332910163542678, + "grad_norm": 0.5300223231315613, + "kl": 0.27734375, + "learning_rate": 9.521814882397247e-07, + "loss": 0.0304, + "reward": 1.0803572088479996, + "reward_std": 0.1775421667844057, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.957589328289032, + "step": 781 + }, + { + "completion_length": 1085.4710540771484, + "epoch": 0.23358972444178927, + "grad_norm": 0.6137241721153259, + "kl": 0.32275390625, + "learning_rate": 9.519707177000414e-07, + "loss": 0.0217, + "reward": 1.189732164144516, + "reward_std": 0.16660825908184052, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9866071939468384, + "step": 782 + }, + { + "completion_length": 1083.2411193847656, + "epoch": 0.23388843252931074, + "grad_norm": 0.4248321056365967, + "kl": 0.3681640625, + "learning_rate": 9.517595098556148e-07, + "loss": 0.0163, + "reward": 1.116071492433548, + "reward_std": 0.1949775032699108, + "rewards/accuracy_reward": 0.14285714738070965, + "rewards/format_reward": 0.9732143133878708, + "step": 783 + }, + { + "completion_length": 1075.5558624267578, + "epoch": 0.2341871406168322, + "grad_norm": 0.6082937717437744, + "kl": 0.28369140625, + "learning_rate": 9.51547864936218e-07, + "loss": 0.0232, + "reward": 1.0691964626312256, + "reward_std": 0.16002090275287628, + "rewards/accuracy_reward": 0.09375000395812094, + "rewards/format_reward": 0.9754464775323868, + "step": 784 + }, + { + "completion_length": 1060.4866638183594, + "epoch": 0.23448584870435368, + "grad_norm": 0.7637636065483093, + "kl": 0.32421875, + "learning_rate": 9.513357831721002e-07, + "loss": 0.0298, + "reward": 1.1361607611179352, + "reward_std": 0.20002071652561426, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.9799107611179352, + "step": 785 + }, + { + "completion_length": 1057.2366333007812, + "epoch": 0.23478455679187513, + "grad_norm": 0.5262110829353333, + "kl": 0.33447265625, + "learning_rate": 9.511232647939852e-07, + "loss": 0.033, + "reward": 1.0558036118745804, + "reward_std": 0.18093757331371307, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9642857611179352, + "step": 786 + }, + { + "completion_length": 1138.810302734375, + "epoch": 0.2350832648793966, + "grad_norm": 0.6328285932540894, + "kl": 0.289794921875, + "learning_rate": 9.509103100330727e-07, + "loss": 0.0113, + "reward": 1.0424107611179352, + "reward_std": 0.20929799787700176, + "rewards/accuracy_reward": 0.07589286286383867, + "rewards/format_reward": 0.96651791036129, + "step": 787 + }, + { + "completion_length": 1044.0625457763672, + "epoch": 0.23538197296691807, + "grad_norm": 0.4786671996116638, + "kl": 0.323974609375, + "learning_rate": 9.506969191210362e-07, + "loss": 0.0318, + "reward": 1.2433036267757416, + "reward_std": 0.24100054427981377, + "rewards/accuracy_reward": 0.2790178693830967, + "rewards/format_reward": 0.9642857611179352, + "step": 788 + }, + { + "completion_length": 1021.5759429931641, + "epoch": 0.23568068105443954, + "grad_norm": 0.5971327424049377, + "kl": 0.3408203125, + "learning_rate": 9.504830922900241e-07, + "loss": 0.0638, + "reward": 1.1383929252624512, + "reward_std": 0.15374589897692204, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.9821428805589676, + "step": 789 + }, + { + "completion_length": 1013.7299652099609, + "epoch": 0.235979389141961, + "grad_norm": 0.5882115364074707, + "kl": 0.2724609375, + "learning_rate": 9.502688297726594e-07, + "loss": 0.0454, + "reward": 1.1629464626312256, + "reward_std": 0.21399031020700932, + "rewards/accuracy_reward": 0.18973214854486287, + "rewards/format_reward": 0.973214328289032, + "step": 790 + }, + { + "completion_length": 1023.9755096435547, + "epoch": 0.23627809722948248, + "grad_norm": 0.5221239328384399, + "kl": 0.291015625, + "learning_rate": 9.500541318020382e-07, + "loss": 0.0208, + "reward": 1.194196492433548, + "reward_std": 0.17957623302936554, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9732143133878708, + "step": 791 + }, + { + "completion_length": 1022.2165374755859, + "epoch": 0.23657680531700395, + "grad_norm": 0.6816852688789368, + "kl": 0.249267578125, + "learning_rate": 9.498389986117312e-07, + "loss": 0.0284, + "reward": 1.178571492433548, + "reward_std": 0.20104889944195747, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9821428805589676, + "step": 792 + }, + { + "completion_length": 1080.5312805175781, + "epoch": 0.23687551340452542, + "grad_norm": 0.38126030564308167, + "kl": 0.302001953125, + "learning_rate": 9.496234304357822e-07, + "loss": 0.0273, + "reward": 1.066964328289032, + "reward_std": 0.16046492010354996, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9754464626312256, + "step": 793 + }, + { + "completion_length": 973.6205902099609, + "epoch": 0.2371742214920469, + "grad_norm": 0.761563777923584, + "kl": 0.21337890625, + "learning_rate": 9.494074275087081e-07, + "loss": 0.0403, + "reward": 1.2232143580913544, + "reward_std": 0.19499630481004715, + "rewards/accuracy_reward": 0.243303582072258, + "rewards/format_reward": 0.979910746216774, + "step": 794 + }, + { + "completion_length": 976.3750305175781, + "epoch": 0.23747292957956836, + "grad_norm": 0.5218312740325928, + "kl": 0.271240234375, + "learning_rate": 9.49190990065499e-07, + "loss": 0.0418, + "reward": 1.145089328289032, + "reward_std": 0.20439547300338745, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.9687500447034836, + "step": 795 + }, + { + "completion_length": 1025.9620971679688, + "epoch": 0.23777163766708984, + "grad_norm": 0.6910086870193481, + "kl": 0.31982421875, + "learning_rate": 9.489741183416178e-07, + "loss": 0.0385, + "reward": 1.1383929252624512, + "reward_std": 0.2411033697426319, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9709821939468384, + "step": 796 + }, + { + "completion_length": 1009.8214569091797, + "epoch": 0.2380703457546113, + "grad_norm": 0.3821786642074585, + "kl": 0.2646484375, + "learning_rate": 9.487568125729994e-07, + "loss": 0.0141, + "reward": 1.0379464775323868, + "reward_std": 0.1627989998087287, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.9732143133878708, + "step": 797 + }, + { + "completion_length": 1122.247802734375, + "epoch": 0.23836905384213278, + "grad_norm": 0.6718454957008362, + "kl": 0.43212890625, + "learning_rate": 9.485390729960514e-07, + "loss": 0.0683, + "reward": 1.022321492433548, + "reward_std": 0.2651587910950184, + "rewards/accuracy_reward": 0.08035714854486287, + "rewards/format_reward": 0.941964328289032, + "step": 798 + }, + { + "completion_length": 1007.3661041259766, + "epoch": 0.23866776192965425, + "grad_norm": 0.4284169673919678, + "kl": 0.249755859375, + "learning_rate": 9.483208998476529e-07, + "loss": 0.0328, + "reward": 1.1450893580913544, + "reward_std": 0.2139020748436451, + "rewards/accuracy_reward": 0.16964286309666932, + "rewards/format_reward": 0.9754464775323868, + "step": 799 + }, + { + "completion_length": 1060.7723541259766, + "epoch": 0.23896647001717572, + "grad_norm": 0.42949748039245605, + "kl": 0.2626953125, + "learning_rate": 9.481022933651549e-07, + "loss": 0.0176, + "reward": 1.209821492433548, + "reward_std": 0.22594887763261795, + "rewards/accuracy_reward": 0.232142873108387, + "rewards/format_reward": 0.9776786118745804, + "step": 800 + }, + { + "completion_length": 1058.0201263427734, + "epoch": 0.2392651781046972, + "grad_norm": 0.48062288761138916, + "kl": 0.339111328125, + "learning_rate": 9.478832537863801e-07, + "loss": 0.0238, + "reward": 1.1049107909202576, + "reward_std": 0.241295013576746, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.973214328289032, + "step": 801 + }, + { + "completion_length": 952.1406555175781, + "epoch": 0.23956388619221866, + "grad_norm": 0.6209874749183655, + "kl": 0.3828125, + "learning_rate": 9.476637813496219e-07, + "loss": 0.0424, + "reward": 1.0892857909202576, + "reward_std": 0.20153307169675827, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9665178805589676, + "step": 802 + }, + { + "completion_length": 1023.7522735595703, + "epoch": 0.23986259427974013, + "grad_norm": 0.5614138245582581, + "kl": 0.37158203125, + "learning_rate": 9.474438762936449e-07, + "loss": 0.0175, + "reward": 1.0870536267757416, + "reward_std": 0.24095315486192703, + "rewards/accuracy_reward": 0.14732143748551607, + "rewards/format_reward": 0.9397321939468384, + "step": 803 + }, + { + "completion_length": 1116.5603332519531, + "epoch": 0.2401613023672616, + "grad_norm": 1.3242069482803345, + "kl": 0.39990234375, + "learning_rate": 9.47223538857684e-07, + "loss": 0.0471, + "reward": 1.0714285969734192, + "reward_std": 0.26388124376535416, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.95089291036129, + "step": 804 + }, + { + "completion_length": 1042.886215209961, + "epoch": 0.24046001045478307, + "grad_norm": 0.5222337245941162, + "kl": 0.298583984375, + "learning_rate": 9.470027692814451e-07, + "loss": -0.0091, + "reward": 1.2366071939468384, + "reward_std": 0.2715054973959923, + "rewards/accuracy_reward": 0.2745535783469677, + "rewards/format_reward": 0.9620535969734192, + "step": 805 + }, + { + "completion_length": 1098.9442749023438, + "epoch": 0.24075871854230455, + "grad_norm": 0.6245018839836121, + "kl": 0.303466796875, + "learning_rate": 9.467815678051036e-07, + "loss": 0.0213, + "reward": 1.2299107611179352, + "reward_std": 0.1834851112216711, + "rewards/accuracy_reward": 0.2566964440047741, + "rewards/format_reward": 0.973214328289032, + "step": 806 + }, + { + "completion_length": 1000.9219207763672, + "epoch": 0.24105742662982602, + "grad_norm": 0.485649973154068, + "kl": 0.2578125, + "learning_rate": 9.46559934669305e-07, + "loss": 0.0086, + "reward": 1.1696428954601288, + "reward_std": 0.17758429795503616, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9776786118745804, + "step": 807 + }, + { + "completion_length": 1131.1808471679688, + "epoch": 0.24135613471734746, + "grad_norm": 0.6818820834159851, + "kl": 0.3564453125, + "learning_rate": 9.463378701151646e-07, + "loss": 0.0147, + "reward": 1.0446428805589676, + "reward_std": 0.20061798579990864, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.9620536118745804, + "step": 808 + }, + { + "completion_length": 1120.7835388183594, + "epoch": 0.24165484280486893, + "grad_norm": 0.541103184223175, + "kl": 0.317138671875, + "learning_rate": 9.461153743842668e-07, + "loss": 0.036, + "reward": 0.9977679252624512, + "reward_std": 0.14751976821571589, + "rewards/accuracy_reward": 0.024553572293370962, + "rewards/format_reward": 0.9732143133878708, + "step": 809 + }, + { + "completion_length": 1121.7054138183594, + "epoch": 0.2419535508923904, + "grad_norm": 0.4892750382423401, + "kl": 0.254150390625, + "learning_rate": 9.458924477186651e-07, + "loss": 0.0234, + "reward": 1.1205357611179352, + "reward_std": 0.16641459800302982, + "rewards/accuracy_reward": 0.14508929196745157, + "rewards/format_reward": 0.9754464775323868, + "step": 810 + }, + { + "completion_length": 1190.2277526855469, + "epoch": 0.24225225897991187, + "grad_norm": 0.8165367841720581, + "kl": 0.2353515625, + "learning_rate": 9.456690903608822e-07, + "loss": 0.026, + "reward": 1.1316964626312256, + "reward_std": 0.22955770418047905, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9598214775323868, + "step": 811 + }, + { + "completion_length": 1224.1295471191406, + "epoch": 0.24255096706743334, + "grad_norm": 0.8650496602058411, + "kl": 0.240966796875, + "learning_rate": 9.454453025539084e-07, + "loss": 0.0492, + "reward": 1.1227678954601288, + "reward_std": 0.22921208292245865, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.9642857611179352, + "step": 812 + }, + { + "completion_length": 1160.1406555175781, + "epoch": 0.2428496751549548, + "grad_norm": 0.6316642165184021, + "kl": 0.23828125, + "learning_rate": 9.452210845412032e-07, + "loss": 0.0471, + "reward": 1.1071428954601288, + "reward_std": 0.2184934876859188, + "rewards/accuracy_reward": 0.13616071757860482, + "rewards/format_reward": 0.9709821939468384, + "step": 813 + }, + { + "completion_length": 1151.5804138183594, + "epoch": 0.24314838324247628, + "grad_norm": 0.5557881593704224, + "kl": 0.267333984375, + "learning_rate": 9.449964365666941e-07, + "loss": 0.0726, + "reward": 1.0714286416769028, + "reward_std": 0.23206552490592003, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.96651791036129, + "step": 814 + }, + { + "completion_length": 1067.8504943847656, + "epoch": 0.24344709132999776, + "grad_norm": 1.3594590425491333, + "kl": 0.271240234375, + "learning_rate": 9.447713588747756e-07, + "loss": 0.0372, + "reward": 1.0937500596046448, + "reward_std": 0.23802470788359642, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9687500447034836, + "step": 815 + }, + { + "completion_length": 1078.2031555175781, + "epoch": 0.24374579941751923, + "grad_norm": 0.5685669183731079, + "kl": 0.294921875, + "learning_rate": 9.445458517103105e-07, + "loss": 0.0207, + "reward": 1.1495536267757416, + "reward_std": 0.1755993589758873, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9754464626312256, + "step": 816 + }, + { + "completion_length": 1097.7835235595703, + "epoch": 0.2440445075050407, + "grad_norm": 0.7009775042533875, + "kl": 0.284912109375, + "learning_rate": 9.443199153186284e-07, + "loss": 0.0424, + "reward": 1.1383929252624512, + "reward_std": 0.17156792990863323, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9821428954601288, + "step": 817 + }, + { + "completion_length": 1111.7813110351562, + "epoch": 0.24434321559256217, + "grad_norm": 0.36823558807373047, + "kl": 0.32861328125, + "learning_rate": 9.440935499455259e-07, + "loss": 0.036, + "reward": 1.051339328289032, + "reward_std": 0.16175739467144012, + "rewards/accuracy_reward": 0.07366071688011289, + "rewards/format_reward": 0.9776786118745804, + "step": 818 + }, + { + "completion_length": 1095.685302734375, + "epoch": 0.24464192368008364, + "grad_norm": 0.7664465308189392, + "kl": 0.3486328125, + "learning_rate": 9.438667558372665e-07, + "loss": 0.0254, + "reward": 1.142857164144516, + "reward_std": 0.17795905098319054, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9687500298023224, + "step": 819 + }, + { + "completion_length": 1119.3683776855469, + "epoch": 0.2449406317676051, + "grad_norm": 0.6149770617485046, + "kl": 0.3271484375, + "learning_rate": 9.436395332405798e-07, + "loss": 0.0256, + "reward": 1.1428571939468384, + "reward_std": 0.23041700199246407, + "rewards/accuracy_reward": 0.16741072479635477, + "rewards/format_reward": 0.9754464775323868, + "step": 820 + }, + { + "completion_length": 1127.7411193847656, + "epoch": 0.24523933985512658, + "grad_norm": 0.5890131592750549, + "kl": 0.3623046875, + "learning_rate": 9.434118824026616e-07, + "loss": 0.0443, + "reward": 1.100446492433548, + "reward_std": 0.275294940918684, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9598214626312256, + "step": 821 + }, + { + "completion_length": 1119.2388916015625, + "epoch": 0.24553804794264805, + "grad_norm": 0.8687578439712524, + "kl": 0.33984375, + "learning_rate": 9.43183803571174e-07, + "loss": 0.0217, + "reward": 1.0736607611179352, + "reward_std": 0.2138892225921154, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 822 + }, + { + "completion_length": 1090.3192291259766, + "epoch": 0.24583675603016952, + "grad_norm": 0.45854273438453674, + "kl": 0.2783203125, + "learning_rate": 9.429552969942443e-07, + "loss": 0.0169, + "reward": 1.1406250298023224, + "reward_std": 0.2398284114897251, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9709821790456772, + "step": 823 + }, + { + "completion_length": 1149.4286193847656, + "epoch": 0.246135464117691, + "grad_norm": 0.4550832509994507, + "kl": 0.3291015625, + "learning_rate": 9.427263629204651e-07, + "loss": 0.0183, + "reward": 1.1227679252624512, + "reward_std": 0.24596641585230827, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9709821939468384, + "step": 824 + }, + { + "completion_length": 1108.354965209961, + "epoch": 0.24643417220521247, + "grad_norm": 0.40300238132476807, + "kl": 0.246826171875, + "learning_rate": 9.424970015988943e-07, + "loss": 0.0325, + "reward": 1.0803571939468384, + "reward_std": 0.1428143410012126, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.9866071790456772, + "step": 825 + }, + { + "completion_length": 1115.154052734375, + "epoch": 0.24673288029273394, + "grad_norm": 0.4521799087524414, + "kl": 0.263916015625, + "learning_rate": 9.422672132790549e-07, + "loss": 0.0029, + "reward": 1.2142857611179352, + "reward_std": 0.1637880578637123, + "rewards/accuracy_reward": 0.2343750074505806, + "rewards/format_reward": 0.9799107313156128, + "step": 826 + }, + { + "completion_length": 1163.9665832519531, + "epoch": 0.2470315883802554, + "grad_norm": 0.5570716857910156, + "kl": 0.35888671875, + "learning_rate": 9.420369982109335e-07, + "loss": 0.0253, + "reward": 1.1339286118745804, + "reward_std": 0.1964830756187439, + "rewards/accuracy_reward": 0.15625000232830644, + "rewards/format_reward": 0.9776786118745804, + "step": 827 + }, + { + "completion_length": 1099.8214721679688, + "epoch": 0.24733029646777688, + "grad_norm": 0.6588758230209351, + "kl": 0.27978515625, + "learning_rate": 9.41806356644982e-07, + "loss": 0.0057, + "reward": 1.0558036267757416, + "reward_std": 0.2496199943125248, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.9620536267757416, + "step": 828 + }, + { + "completion_length": 1107.9978332519531, + "epoch": 0.24762900455529832, + "grad_norm": 0.34268438816070557, + "kl": 0.287109375, + "learning_rate": 9.415752888321154e-07, + "loss": 0.0291, + "reward": 1.1116071939468384, + "reward_std": 0.1337930653244257, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.979910746216774, + "step": 829 + }, + { + "completion_length": 1082.8861999511719, + "epoch": 0.2479277126428198, + "grad_norm": 0.42542436718940735, + "kl": 0.24072265625, + "learning_rate": 9.413437950237131e-07, + "loss": 0.0202, + "reward": 1.053571492433548, + "reward_std": 0.14192692004144192, + "rewards/accuracy_reward": 0.07812500186264515, + "rewards/format_reward": 0.9754464775323868, + "step": 830 + }, + { + "completion_length": 1103.669677734375, + "epoch": 0.24822642073034126, + "grad_norm": 0.5867838263511658, + "kl": 0.253173828125, + "learning_rate": 9.411118754716177e-07, + "loss": 0.0369, + "reward": 1.1160714626312256, + "reward_std": 0.19084028527140617, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9754464775323868, + "step": 831 + }, + { + "completion_length": 1145.1920471191406, + "epoch": 0.24852512881786273, + "grad_norm": 0.8226855397224426, + "kl": 0.25048828125, + "learning_rate": 9.40879530428135e-07, + "loss": 0.0373, + "reward": 1.1160714477300644, + "reward_std": 0.17415086086839437, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9754464626312256, + "step": 832 + }, + { + "completion_length": 1124.8929138183594, + "epoch": 0.2488238369053842, + "grad_norm": 1.680649757385254, + "kl": 0.337890625, + "learning_rate": 9.406467601460333e-07, + "loss": 0.0499, + "reward": 1.0647322237491608, + "reward_std": 0.22633282467722893, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9598214626312256, + "step": 833 + }, + { + "completion_length": 1093.4420166015625, + "epoch": 0.24912254499290568, + "grad_norm": 0.8655956387519836, + "kl": 0.28125, + "learning_rate": 9.404135648785441e-07, + "loss": 0.0359, + "reward": 1.0558036267757416, + "reward_std": 0.1540310475975275, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.9709821939468384, + "step": 834 + }, + { + "completion_length": 1168.1205749511719, + "epoch": 0.24942125308042715, + "grad_norm": 0.49104753136634827, + "kl": 0.271240234375, + "learning_rate": 9.40179944879361e-07, + "loss": 0.0319, + "reward": 1.1294643133878708, + "reward_std": 0.2015400528907776, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9665178954601288, + "step": 835 + }, + { + "completion_length": 1150.2142944335938, + "epoch": 0.24971996116794862, + "grad_norm": 0.7227329015731812, + "kl": 0.29296875, + "learning_rate": 9.399459004026396e-07, + "loss": 0.0263, + "reward": 1.111607164144516, + "reward_std": 0.22180694341659546, + "rewards/accuracy_reward": 0.1272321529686451, + "rewards/format_reward": 0.9843750596046448, + "step": 836 + }, + { + "completion_length": 1112.3996276855469, + "epoch": 0.2500186692554701, + "grad_norm": 0.46920958161354065, + "kl": 0.28955078125, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0329, + "reward": 1.1316964626312256, + "reward_std": 0.22036587074398994, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9732143133878708, + "step": 837 + }, + { + "completion_length": 1130.5848541259766, + "epoch": 0.2503173773429916, + "grad_norm": 0.7054442167282104, + "kl": 0.30859375, + "learning_rate": 9.394765390355133e-07, + "loss": 0.0506, + "reward": 1.2008929252624512, + "reward_std": 0.22870584577322006, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9687500447034836, + "step": 838 + }, + { + "completion_length": 1094.6473693847656, + "epoch": 0.25061608543051306, + "grad_norm": 0.6102554798126221, + "kl": 0.28125, + "learning_rate": 9.392412226557275e-07, + "loss": 0.0191, + "reward": 1.0401786267757416, + "reward_std": 0.1814846470952034, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.964285746216774, + "step": 839 + }, + { + "completion_length": 1083.875015258789, + "epoch": 0.2509147935180345, + "grad_norm": 0.5029820799827576, + "kl": 0.29736328125, + "learning_rate": 9.390054828196412e-07, + "loss": 0.0339, + "reward": 1.1741072237491608, + "reward_std": 0.18872937932610512, + "rewards/accuracy_reward": 0.20535715483129025, + "rewards/format_reward": 0.9687500298023224, + "step": 840 + }, + { + "completion_length": 1082.8572082519531, + "epoch": 0.25121350160555594, + "grad_norm": 0.800803005695343, + "kl": 0.37158203125, + "learning_rate": 9.387693197837162e-07, + "loss": 0.0364, + "reward": 1.127232164144516, + "reward_std": 0.13243891298770905, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.988839328289032, + "step": 841 + }, + { + "completion_length": 1004.6853179931641, + "epoch": 0.2515122096930774, + "grad_norm": 0.9549276232719421, + "kl": 0.300537109375, + "learning_rate": 9.385327338048749e-07, + "loss": 0.0317, + "reward": 1.1517857760190964, + "reward_std": 0.1338613387197256, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9843750447034836, + "step": 842 + }, + { + "completion_length": 1126.77685546875, + "epoch": 0.2518109177805989, + "grad_norm": 1.4896636009216309, + "kl": 0.314453125, + "learning_rate": 9.382957251404995e-07, + "loss": 0.0064, + "reward": 1.0937500596046448, + "reward_std": 0.1599847637116909, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.973214328289032, + "step": 843 + }, + { + "completion_length": 1118.591552734375, + "epoch": 0.25210962586812036, + "grad_norm": 0.42455658316612244, + "kl": 0.29638671875, + "learning_rate": 9.38058294048432e-07, + "loss": 0.033, + "reward": 1.1406250596046448, + "reward_std": 0.20334450900554657, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.979910746216774, + "step": 844 + }, + { + "completion_length": 1035.1719360351562, + "epoch": 0.25240833395564183, + "grad_norm": 0.24835386872291565, + "kl": 0.25146484375, + "learning_rate": 9.378204407869747e-07, + "loss": 0.014, + "reward": 1.1830357611179352, + "reward_std": 0.183792132884264, + "rewards/accuracy_reward": 0.2031250149011612, + "rewards/format_reward": 0.979910746216774, + "step": 845 + }, + { + "completion_length": 1038.5067291259766, + "epoch": 0.2527070420431633, + "grad_norm": 0.5641106963157654, + "kl": 0.251953125, + "learning_rate": 9.37582165614888e-07, + "loss": 0.0207, + "reward": 1.1495536267757416, + "reward_std": 0.16468071565032005, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9888392984867096, + "step": 846 + }, + { + "completion_length": 1091.9330749511719, + "epoch": 0.25300575013068477, + "grad_norm": 0.6576843857765198, + "kl": 0.290283203125, + "learning_rate": 9.373434687913924e-07, + "loss": 0.0399, + "reward": 1.0625000447034836, + "reward_std": 0.1886096689850092, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.9620536118745804, + "step": 847 + }, + { + "completion_length": 1068.118408203125, + "epoch": 0.25330445821820624, + "grad_norm": 0.897584080696106, + "kl": 0.32080078125, + "learning_rate": 9.371043505761664e-07, + "loss": 0.0076, + "reward": 1.0982143580913544, + "reward_std": 0.10229229833930731, + "rewards/accuracy_reward": 0.12276786426082253, + "rewards/format_reward": 0.9754464626312256, + "step": 848 + }, + { + "completion_length": 1092.4598999023438, + "epoch": 0.2536031663057277, + "grad_norm": 0.4416831433773041, + "kl": 0.23779296875, + "learning_rate": 9.368648112293474e-07, + "loss": 0.0216, + "reward": 1.1071429252624512, + "reward_std": 0.24122978001832962, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9687500447034836, + "step": 849 + }, + { + "completion_length": 1137.6830749511719, + "epoch": 0.2539018743932492, + "grad_norm": 0.9217676520347595, + "kl": 0.3046875, + "learning_rate": 9.366248510115307e-07, + "loss": 0.0405, + "reward": 1.1584821939468384, + "reward_std": 0.1871834211051464, + "rewards/accuracy_reward": 0.1830357275903225, + "rewards/format_reward": 0.9754464477300644, + "step": 850 + }, + { + "completion_length": 1079.3616790771484, + "epoch": 0.25420058248077065, + "grad_norm": 0.5347990989685059, + "kl": 0.291259765625, + "learning_rate": 9.363844701837692e-07, + "loss": 0.0384, + "reward": 1.1830357313156128, + "reward_std": 0.2209341675043106, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9709821790456772, + "step": 851 + }, + { + "completion_length": 1070.9531707763672, + "epoch": 0.2544992905682921, + "grad_norm": 0.5602172613143921, + "kl": 0.24658203125, + "learning_rate": 9.361436690075739e-07, + "loss": 0.031, + "reward": 1.1495535969734192, + "reward_std": 0.12809659354388714, + "rewards/accuracy_reward": 0.16741071990691125, + "rewards/format_reward": 0.9821428805589676, + "step": 852 + }, + { + "completion_length": 1031.8125305175781, + "epoch": 0.2547979986558136, + "grad_norm": 0.808586835861206, + "kl": 0.233154296875, + "learning_rate": 9.359024477449128e-07, + "loss": 0.0219, + "reward": 1.1919643580913544, + "reward_std": 0.1933046132326126, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9799107611179352, + "step": 853 + }, + { + "completion_length": 1164.9822082519531, + "epoch": 0.25509670674333507, + "grad_norm": 0.40623122453689575, + "kl": 0.32275390625, + "learning_rate": 9.356608066582113e-07, + "loss": 0.0196, + "reward": 1.0937500596046448, + "reward_std": 0.15233944170176983, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.986607164144516, + "step": 854 + }, + { + "completion_length": 1084.2723541259766, + "epoch": 0.25539541483085654, + "grad_norm": 0.6491303443908691, + "kl": 0.25146484375, + "learning_rate": 9.354187460103508e-07, + "loss": 0.0098, + "reward": 1.1049107909202576, + "reward_std": 0.23838946968317032, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9598214626312256, + "step": 855 + }, + { + "completion_length": 1202.83935546875, + "epoch": 0.255694122918378, + "grad_norm": 1.64185631275177, + "kl": 0.329833984375, + "learning_rate": 9.351762660646698e-07, + "loss": 0.0212, + "reward": 0.9977678954601288, + "reward_std": 0.26141420751810074, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.941964328289032, + "step": 856 + }, + { + "completion_length": 1032.3772888183594, + "epoch": 0.2559928310058995, + "grad_norm": 0.5616814494132996, + "kl": 0.21875, + "learning_rate": 9.349333670849628e-07, + "loss": 0.0136, + "reward": 1.111607164144516, + "reward_std": 0.1881246641278267, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.9776786118745804, + "step": 857 + }, + { + "completion_length": 1096.638442993164, + "epoch": 0.25629153909342095, + "grad_norm": 0.6317376494407654, + "kl": 0.19921875, + "learning_rate": 9.346900493354798e-07, + "loss": 0.0151, + "reward": 1.0825893133878708, + "reward_std": 0.18775360099971294, + "rewards/accuracy_reward": 0.12276786053553224, + "rewards/format_reward": 0.9598214775323868, + "step": 858 + }, + { + "completion_length": 1068.9420166015625, + "epoch": 0.2565902471809424, + "grad_norm": 0.7451961636543274, + "kl": 0.236572265625, + "learning_rate": 9.344463130809267e-07, + "loss": 0.0248, + "reward": 1.1227679252624512, + "reward_std": 0.16362351551651955, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9866071790456772, + "step": 859 + }, + { + "completion_length": 1078.1250762939453, + "epoch": 0.2568889552684639, + "grad_norm": 1.2554875612258911, + "kl": 0.173828125, + "learning_rate": 9.342021585864649e-07, + "loss": 0.021, + "reward": 1.1964286267757416, + "reward_std": 0.22740770131349564, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.973214328289032, + "step": 860 + }, + { + "completion_length": 1213.997802734375, + "epoch": 0.25718766335598536, + "grad_norm": 0.5693915486335754, + "kl": 0.270263671875, + "learning_rate": 9.339575861177103e-07, + "loss": 0.0272, + "reward": 1.1004464626312256, + "reward_std": 0.21751517802476883, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.9665178805589676, + "step": 861 + }, + { + "completion_length": 1009.0246276855469, + "epoch": 0.25748637144350683, + "grad_norm": 1.1865284442901611, + "kl": 0.2265625, + "learning_rate": 9.337125959407341e-07, + "loss": 0.0095, + "reward": 1.1964286267757416, + "reward_std": 0.18243474140763283, + "rewards/accuracy_reward": 0.2209821492433548, + "rewards/format_reward": 0.9754464775323868, + "step": 862 + }, + { + "completion_length": 1084.1763610839844, + "epoch": 0.2577850795310283, + "grad_norm": 0.31609776616096497, + "kl": 0.192626953125, + "learning_rate": 9.33467188322061e-07, + "loss": 0.0245, + "reward": 1.1116072237491608, + "reward_std": 0.14900673646479845, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9776786118745804, + "step": 863 + }, + { + "completion_length": 1092.8995971679688, + "epoch": 0.2580837876185498, + "grad_norm": 0.8070145845413208, + "kl": 0.1787109375, + "learning_rate": 9.332213635286713e-07, + "loss": 0.0318, + "reward": 1.1138392984867096, + "reward_std": 0.16518785990774632, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.973214328289032, + "step": 864 + }, + { + "completion_length": 1201.7165832519531, + "epoch": 0.25838249570607125, + "grad_norm": 0.5686659216880798, + "kl": 0.3388671875, + "learning_rate": 9.329751218279975e-07, + "loss": 0.052, + "reward": 1.0937500596046448, + "reward_std": 0.1801279503852129, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9709821790456772, + "step": 865 + }, + { + "completion_length": 1246.6094665527344, + "epoch": 0.2586812037935927, + "grad_norm": 0.5514845252037048, + "kl": 0.287841796875, + "learning_rate": 9.327284634879269e-07, + "loss": 0.0264, + "reward": 1.131696492433548, + "reward_std": 0.20778976008296013, + "rewards/accuracy_reward": 0.15848215483129025, + "rewards/format_reward": 0.9732143133878708, + "step": 866 + }, + { + "completion_length": 1037.294692993164, + "epoch": 0.2589799118811142, + "grad_norm": 0.45814692974090576, + "kl": 0.1566162109375, + "learning_rate": 9.324813887767993e-07, + "loss": 0.0077, + "reward": 1.2366071939468384, + "reward_std": 0.2092730700969696, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9866071790456772, + "step": 867 + }, + { + "completion_length": 1094.4241638183594, + "epoch": 0.25927861996863566, + "grad_norm": 0.6438884139060974, + "kl": 0.262939453125, + "learning_rate": 9.322338979634082e-07, + "loss": 0.0367, + "reward": 1.1272321939468384, + "reward_std": 0.2516888678073883, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9687500298023224, + "step": 868 + }, + { + "completion_length": 1129.0938110351562, + "epoch": 0.25957732805615713, + "grad_norm": 0.4894266128540039, + "kl": 0.28271484375, + "learning_rate": 9.319859913169987e-07, + "loss": 0.0315, + "reward": 1.214285746216774, + "reward_std": 0.17986544780433178, + "rewards/accuracy_reward": 0.23437501350417733, + "rewards/format_reward": 0.9799107611179352, + "step": 869 + }, + { + "completion_length": 1126.6719360351562, + "epoch": 0.2598760361436786, + "grad_norm": 0.7696977853775024, + "kl": 0.326171875, + "learning_rate": 9.317376691072694e-07, + "loss": 0.0534, + "reward": 1.0334821790456772, + "reward_std": 0.21658526360988617, + "rewards/accuracy_reward": 0.08258928754366934, + "rewards/format_reward": 0.9508928954601288, + "step": 870 + }, + { + "completion_length": 1178.8705749511719, + "epoch": 0.2601747442312001, + "grad_norm": 0.8065361976623535, + "kl": 0.2314453125, + "learning_rate": 9.314889316043706e-07, + "loss": 0.0437, + "reward": 1.1897321939468384, + "reward_std": 0.2268661893904209, + "rewards/accuracy_reward": 0.2187500149011612, + "rewards/format_reward": 0.9709821939468384, + "step": 871 + }, + { + "completion_length": 1091.370590209961, + "epoch": 0.26047345231872154, + "grad_norm": 0.5234848856925964, + "kl": 0.216796875, + "learning_rate": 9.312397790789039e-07, + "loss": 0.036, + "reward": 1.1004464626312256, + "reward_std": 0.250445693731308, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9709821790456772, + "step": 872 + }, + { + "completion_length": 1113.4754791259766, + "epoch": 0.260772160406243, + "grad_norm": 0.4409147799015045, + "kl": 0.214111328125, + "learning_rate": 9.309902118019233e-07, + "loss": 0.0277, + "reward": 1.084821492433548, + "reward_std": 0.11630143411457539, + "rewards/accuracy_reward": 0.10937500465661287, + "rewards/format_reward": 0.9754464775323868, + "step": 873 + }, + { + "completion_length": 1218.1272583007812, + "epoch": 0.2610708684937645, + "grad_norm": 0.7303836345672607, + "kl": 0.27587890625, + "learning_rate": 9.307402300449332e-07, + "loss": 0.0322, + "reward": 1.0647321939468384, + "reward_std": 0.18556641414761543, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.9709821790456772, + "step": 874 + }, + { + "completion_length": 1122.372802734375, + "epoch": 0.26136957658128596, + "grad_norm": 0.6376983523368835, + "kl": 0.33056640625, + "learning_rate": 9.304898340798894e-07, + "loss": 0.0127, + "reward": 1.167410746216774, + "reward_std": 0.18694846611469984, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9709821790456772, + "step": 875 + }, + { + "completion_length": 1096.966552734375, + "epoch": 0.2616682846688074, + "grad_norm": 0.863745391368866, + "kl": 0.256591796875, + "learning_rate": 9.302390241791981e-07, + "loss": 0.0457, + "reward": 1.1674107611179352, + "reward_std": 0.20416661724448204, + "rewards/accuracy_reward": 0.19196429289877415, + "rewards/format_reward": 0.9754464626312256, + "step": 876 + }, + { + "completion_length": 1157.4933471679688, + "epoch": 0.2619669927563289, + "grad_norm": 1.3270208835601807, + "kl": 0.3837890625, + "learning_rate": 9.299878006157159e-07, + "loss": 0.0635, + "reward": 1.1205357611179352, + "reward_std": 0.2395208589732647, + "rewards/accuracy_reward": 0.17410715483129025, + "rewards/format_reward": 0.9464286118745804, + "step": 877 + }, + { + "completion_length": 1211.357177734375, + "epoch": 0.26226570084385037, + "grad_norm": 0.6554855108261108, + "kl": 0.30517578125, + "learning_rate": 9.297361636627496e-07, + "loss": 0.0248, + "reward": 1.1785714626312256, + "reward_std": 0.24019699543714523, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9598214626312256, + "step": 878 + }, + { + "completion_length": 1112.0893249511719, + "epoch": 0.26256440893137184, + "grad_norm": 0.6557765603065491, + "kl": 0.3115234375, + "learning_rate": 9.294841135940553e-07, + "loss": 0.0129, + "reward": 1.162946492433548, + "reward_std": 0.14162128511816263, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.973214328289032, + "step": 879 + }, + { + "completion_length": 1210.38623046875, + "epoch": 0.2628631170188933, + "grad_norm": 0.771122932434082, + "kl": 0.4287109375, + "learning_rate": 9.292316506838387e-07, + "loss": 0.0231, + "reward": 1.129464328289032, + "reward_std": 0.28082237020134926, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.955357164144516, + "step": 880 + }, + { + "completion_length": 1151.6562805175781, + "epoch": 0.2631618251064148, + "grad_norm": 0.6840989589691162, + "kl": 0.41845703125, + "learning_rate": 9.289787752067549e-07, + "loss": 0.0183, + "reward": 1.0937500596046448, + "reward_std": 0.2055133767426014, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.9598214626312256, + "step": 881 + }, + { + "completion_length": 1143.5692138671875, + "epoch": 0.26346053319393625, + "grad_norm": 1.015975832939148, + "kl": 0.39990234375, + "learning_rate": 9.287254874379077e-07, + "loss": 0.0036, + "reward": 1.1495536267757416, + "reward_std": 0.17295554839074612, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9843750149011612, + "step": 882 + }, + { + "completion_length": 1121.4308471679688, + "epoch": 0.26375924128145767, + "grad_norm": 0.9072540998458862, + "kl": 0.34619140625, + "learning_rate": 9.284717876528492e-07, + "loss": 0.0288, + "reward": 1.100446492433548, + "reward_std": 0.2254687286913395, + "rewards/accuracy_reward": 0.12946429383009672, + "rewards/format_reward": 0.9709821790456772, + "step": 883 + }, + { + "completion_length": 1204.3683471679688, + "epoch": 0.26405794936897914, + "grad_norm": 1.1364487409591675, + "kl": 0.355224609375, + "learning_rate": 9.282176761275799e-07, + "loss": 0.0359, + "reward": 1.0446428656578064, + "reward_std": 0.21496228501200676, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.9642857611179352, + "step": 884 + }, + { + "completion_length": 1115.1897735595703, + "epoch": 0.2643566574565006, + "grad_norm": 0.9937188029289246, + "kl": 0.3079833984375, + "learning_rate": 9.279631531385483e-07, + "loss": 0.0361, + "reward": 1.2165179252624512, + "reward_std": 0.1840682588517666, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9776786118745804, + "step": 885 + }, + { + "completion_length": 1121.8259582519531, + "epoch": 0.2646553655440221, + "grad_norm": 0.8860645890235901, + "kl": 0.232177734375, + "learning_rate": 9.277082189626506e-07, + "loss": 0.0028, + "reward": 1.1674107909202576, + "reward_std": 0.15243254974484444, + "rewards/accuracy_reward": 0.18303572130389512, + "rewards/format_reward": 0.9843750447034836, + "step": 886 + }, + { + "completion_length": 1194.2879943847656, + "epoch": 0.26495407363154355, + "grad_norm": 0.42961108684539795, + "kl": 0.28564453125, + "learning_rate": 9.274528738772299e-07, + "loss": 0.0334, + "reward": 1.1071428954601288, + "reward_std": 0.18612509965896606, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9754464775323868, + "step": 887 + }, + { + "completion_length": 1162.2611999511719, + "epoch": 0.265252781719065, + "grad_norm": 0.5061367750167847, + "kl": 0.243408203125, + "learning_rate": 9.27197118160077e-07, + "loss": 0.0402, + "reward": 1.1339286118745804, + "reward_std": 0.21223367750644684, + "rewards/accuracy_reward": 0.16294643026776612, + "rewards/format_reward": 0.9709821790456772, + "step": 888 + }, + { + "completion_length": 1212.4621276855469, + "epoch": 0.2655514898065865, + "grad_norm": 0.4192088842391968, + "kl": 0.26806640625, + "learning_rate": 9.269409520894285e-07, + "loss": 0.0293, + "reward": 1.0736607611179352, + "reward_std": 0.19683298468589783, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.9799107611179352, + "step": 889 + }, + { + "completion_length": 1175.1942138671875, + "epoch": 0.26585019789410796, + "grad_norm": 0.6604803800582886, + "kl": 0.2578125, + "learning_rate": 9.266843759439685e-07, + "loss": 0.0332, + "reward": 1.1674107909202576, + "reward_std": 0.21583625301718712, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9776786118745804, + "step": 890 + }, + { + "completion_length": 1167.4911193847656, + "epoch": 0.26614890598162944, + "grad_norm": 1.271112322807312, + "kl": 0.285400390625, + "learning_rate": 9.264273900028265e-07, + "loss": 0.0278, + "reward": 1.1763393431901932, + "reward_std": 0.20591986551880836, + "rewards/accuracy_reward": 0.20089286682195961, + "rewards/format_reward": 0.9754464626312256, + "step": 891 + }, + { + "completion_length": 1242.6652221679688, + "epoch": 0.2664476140691509, + "grad_norm": 0.6233854293823242, + "kl": 0.2353515625, + "learning_rate": 9.26169994545578e-07, + "loss": 0.0319, + "reward": 1.162946492433548, + "reward_std": 0.17767296731472015, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9799107611179352, + "step": 892 + }, + { + "completion_length": 1230.16748046875, + "epoch": 0.2667463221566724, + "grad_norm": 0.9778401255607605, + "kl": 0.26806640625, + "learning_rate": 9.259121898522442e-07, + "loss": 0.0153, + "reward": 1.174107164144516, + "reward_std": 0.18587337248027325, + "rewards/accuracy_reward": 0.20089286658912897, + "rewards/format_reward": 0.9732143133878708, + "step": 893 + }, + { + "completion_length": 1205.0536499023438, + "epoch": 0.26704503024419385, + "grad_norm": 1.1040855646133423, + "kl": 0.253662109375, + "learning_rate": 9.256539762032909e-07, + "loss": 0.0186, + "reward": 1.0625000447034836, + "reward_std": 0.1765753412619233, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9709821790456772, + "step": 894 + }, + { + "completion_length": 1255.8504943847656, + "epoch": 0.2673437383317153, + "grad_norm": 0.8417803645133972, + "kl": 0.232666015625, + "learning_rate": 9.253953538796299e-07, + "loss": 0.0196, + "reward": 1.1272321939468384, + "reward_std": 0.1739343348890543, + "rewards/accuracy_reward": 0.14062501047737896, + "rewards/format_reward": 0.986607164144516, + "step": 895 + }, + { + "completion_length": 1232.9375610351562, + "epoch": 0.2676424464192368, + "grad_norm": 1.2309322357177734, + "kl": 0.259033203125, + "learning_rate": 9.251363231626161e-07, + "loss": 0.0277, + "reward": 1.1875000298023224, + "reward_std": 0.2462448664009571, + "rewards/accuracy_reward": 0.2187500074505806, + "rewards/format_reward": 0.9687500298023224, + "step": 896 + }, + { + "completion_length": 1180.6295166015625, + "epoch": 0.26794115450675826, + "grad_norm": 0.7253634333610535, + "kl": 0.247314453125, + "learning_rate": 9.248768843340505e-07, + "loss": 0.0199, + "reward": 1.098214328289032, + "reward_std": 0.15551801584661007, + "rewards/accuracy_reward": 0.1272321513388306, + "rewards/format_reward": 0.9709821939468384, + "step": 897 + }, + { + "completion_length": 1201.3995971679688, + "epoch": 0.26823986259427973, + "grad_norm": 0.911368727684021, + "kl": 0.232666015625, + "learning_rate": 9.246170376761763e-07, + "loss": 0.0359, + "reward": 1.1049107611179352, + "reward_std": 0.18768258020281792, + "rewards/accuracy_reward": 0.13169643096625805, + "rewards/format_reward": 0.973214328289032, + "step": 898 + }, + { + "completion_length": 1106.0513916015625, + "epoch": 0.2685385706818012, + "grad_norm": 0.9927940964698792, + "kl": 0.28662109375, + "learning_rate": 9.243567834716818e-07, + "loss": 0.0282, + "reward": 1.2165179252624512, + "reward_std": 0.23732930794358253, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9776786118745804, + "step": 899 + }, + { + "completion_length": 1200.950927734375, + "epoch": 0.2688372787693227, + "grad_norm": 1.1855136156082153, + "kl": 0.33349609375, + "learning_rate": 9.240961220036976e-07, + "loss": 0.0368, + "reward": 1.0892857611179352, + "reward_std": 0.2939358353614807, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9553571790456772, + "step": 900 + }, + { + "completion_length": 1190.8438415527344, + "epoch": 0.26913598685684414, + "grad_norm": 1.4852875471115112, + "kl": 0.314208984375, + "learning_rate": 9.23835053555798e-07, + "loss": 0.0612, + "reward": 1.1406250596046448, + "reward_std": 0.261878315359354, + "rewards/accuracy_reward": 0.17633929289877415, + "rewards/format_reward": 0.9642857611179352, + "step": 901 + }, + { + "completion_length": 1150.3526916503906, + "epoch": 0.2694346949443656, + "grad_norm": 1.0805004835128784, + "kl": 0.45166015625, + "learning_rate": 9.23573578412e-07, + "loss": 0.0252, + "reward": 1.0691964626312256, + "reward_std": 0.2142174355685711, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.9776786118745804, + "step": 902 + }, + { + "completion_length": 1219.0781555175781, + "epoch": 0.2697334030318871, + "grad_norm": 0.8841997981071472, + "kl": 0.51806640625, + "learning_rate": 9.233116968567627e-07, + "loss": 0.0598, + "reward": 1.0892857611179352, + "reward_std": 0.25847847387194633, + "rewards/accuracy_reward": 0.1406250111758709, + "rewards/format_reward": 0.9486607611179352, + "step": 903 + }, + { + "completion_length": 1128.7433471679688, + "epoch": 0.27003211111940856, + "grad_norm": 0.838213324546814, + "kl": 0.4892578125, + "learning_rate": 9.230494091749879e-07, + "loss": 0.0608, + "reward": 1.1339286267757416, + "reward_std": 0.20021480694413185, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9620535969734192, + "step": 904 + }, + { + "completion_length": 1148.0714569091797, + "epoch": 0.27033081920693003, + "grad_norm": 0.6323837637901306, + "kl": 0.55517578125, + "learning_rate": 9.227867156520186e-07, + "loss": 0.0691, + "reward": 1.037946492433548, + "reward_std": 0.1958424672484398, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.9687500447034836, + "step": 905 + }, + { + "completion_length": 1206.4888610839844, + "epoch": 0.2706295272944515, + "grad_norm": 0.944372832775116, + "kl": 0.54248046875, + "learning_rate": 9.225236165736395e-07, + "loss": 0.0644, + "reward": 1.1093750298023224, + "reward_std": 0.23502830415964127, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9620536118745804, + "step": 906 + }, + { + "completion_length": 1160.77685546875, + "epoch": 0.27092823538197297, + "grad_norm": 1.3466863632202148, + "kl": 0.61669921875, + "learning_rate": 9.222601122260771e-07, + "loss": 0.0647, + "reward": 1.0491072237491608, + "reward_std": 0.20598071068525314, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.9441964775323868, + "step": 907 + }, + { + "completion_length": 1126.4130249023438, + "epoch": 0.27122694346949444, + "grad_norm": 0.5967773795127869, + "kl": 0.5400390625, + "learning_rate": 9.219962028959978e-07, + "loss": 0.0509, + "reward": 1.1250000298023224, + "reward_std": 0.20626122877001762, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.973214328289032, + "step": 908 + }, + { + "completion_length": 1141.0067749023438, + "epoch": 0.2715256515570159, + "grad_norm": 0.753538966178894, + "kl": 0.5419921875, + "learning_rate": 9.217318888705094e-07, + "loss": 0.0668, + "reward": 1.1383928954601288, + "reward_std": 0.26916617900133133, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.9665178954601288, + "step": 909 + }, + { + "completion_length": 1181.3170166015625, + "epoch": 0.2718243596445374, + "grad_norm": 0.5133890509605408, + "kl": 0.49609375, + "learning_rate": 9.214671704371596e-07, + "loss": 0.0196, + "reward": 1.0825893580913544, + "reward_std": 0.313679289072752, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.926339328289032, + "step": 910 + }, + { + "completion_length": 1053.1451416015625, + "epoch": 0.27212306773205885, + "grad_norm": 1.0550130605697632, + "kl": 0.3291015625, + "learning_rate": 9.212020478839359e-07, + "loss": 0.0507, + "reward": 1.1718750596046448, + "reward_std": 0.27512160688638687, + "rewards/accuracy_reward": 0.20758929289877415, + "rewards/format_reward": 0.9642857760190964, + "step": 911 + }, + { + "completion_length": 1216.0870971679688, + "epoch": 0.2724217758195803, + "grad_norm": 0.6732624769210815, + "kl": 0.50732421875, + "learning_rate": 9.20936521499266e-07, + "loss": 0.0327, + "reward": 1.0959822237491608, + "reward_std": 0.28772853314876556, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.941964328289032, + "step": 912 + }, + { + "completion_length": 1142.9174499511719, + "epoch": 0.2727204839071018, + "grad_norm": 0.6471587419509888, + "kl": 0.453125, + "learning_rate": 9.206705915720162e-07, + "loss": 0.0628, + "reward": 1.100446492433548, + "reward_std": 0.2162456214427948, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.9598214626312256, + "step": 913 + }, + { + "completion_length": 1242.9263916015625, + "epoch": 0.27301919199462327, + "grad_norm": 0.6449885368347168, + "kl": 0.43359375, + "learning_rate": 9.204042583914925e-07, + "loss": 0.0181, + "reward": 1.1160714328289032, + "reward_std": 0.20591357350349426, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9754464626312256, + "step": 914 + }, + { + "completion_length": 1166.4688110351562, + "epoch": 0.27331790008214474, + "grad_norm": 0.961670994758606, + "kl": 0.41748046875, + "learning_rate": 9.201375222474392e-07, + "loss": 0.053, + "reward": 1.0513393431901932, + "reward_std": 0.24984139204025269, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.9375000447034836, + "step": 915 + }, + { + "completion_length": 1266.8080749511719, + "epoch": 0.2736166081696662, + "grad_norm": 0.5493215918540955, + "kl": 0.37451171875, + "learning_rate": 9.198703834300391e-07, + "loss": 0.0456, + "reward": 1.1406250298023224, + "reward_std": 0.24954935908317566, + "rewards/accuracy_reward": 0.18526786379516125, + "rewards/format_reward": 0.9553571790456772, + "step": 916 + }, + { + "completion_length": 1151.3170166015625, + "epoch": 0.2739153162571877, + "grad_norm": 0.5444647669792175, + "kl": 0.421875, + "learning_rate": 9.19602842229913e-07, + "loss": 0.0603, + "reward": 1.1250000596046448, + "reward_std": 0.21346972230821848, + "rewards/accuracy_reward": 0.15848215227015316, + "rewards/format_reward": 0.9665178954601288, + "step": 917 + }, + { + "completion_length": 1121.3772735595703, + "epoch": 0.27421402434470915, + "grad_norm": 1.0496227741241455, + "kl": 0.39404296875, + "learning_rate": 9.193348989381196e-07, + "loss": 0.065, + "reward": 1.0535714775323868, + "reward_std": 0.19896399602293968, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.9642857611179352, + "step": 918 + }, + { + "completion_length": 1115.404052734375, + "epoch": 0.2745127324322306, + "grad_norm": 0.5310172438621521, + "kl": 0.37890625, + "learning_rate": 9.190665538461546e-07, + "loss": 0.0303, + "reward": 1.176339328289032, + "reward_std": 0.2390994019806385, + "rewards/accuracy_reward": 0.20535715762525797, + "rewards/format_reward": 0.970982164144516, + "step": 919 + }, + { + "completion_length": 1174.7857666015625, + "epoch": 0.2748114405197521, + "grad_norm": 0.9168079495429993, + "kl": 0.3984375, + "learning_rate": 9.187978072459517e-07, + "loss": 0.0648, + "reward": 1.2232143580913544, + "reward_std": 0.21722031012177467, + "rewards/accuracy_reward": 0.2522321566939354, + "rewards/format_reward": 0.9709821790456772, + "step": 920 + }, + { + "completion_length": 1207.732177734375, + "epoch": 0.27511014860727356, + "grad_norm": 0.5479586720466614, + "kl": 0.4580078125, + "learning_rate": 9.185286594298804e-07, + "loss": 0.0543, + "reward": 1.069196492433548, + "reward_std": 0.23119354993104935, + "rewards/accuracy_reward": 0.13169643096625805, + "rewards/format_reward": 0.9375000298023224, + "step": 921 + }, + { + "completion_length": 1179.7969055175781, + "epoch": 0.27540885669479503, + "grad_norm": 0.5391339063644409, + "kl": 0.388671875, + "learning_rate": 9.182591106907474e-07, + "loss": 0.0256, + "reward": 1.0825893580913544, + "reward_std": 0.27376485243439674, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.95089291036129, + "step": 922 + }, + { + "completion_length": 1070.857177734375, + "epoch": 0.2757075647823165, + "grad_norm": 0.9653404355049133, + "kl": 0.302001953125, + "learning_rate": 9.179891613217953e-07, + "loss": 0.0144, + "reward": 1.1183036267757416, + "reward_std": 0.1776508241891861, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.9665178954601288, + "step": 923 + }, + { + "completion_length": 1255.7812805175781, + "epoch": 0.276006272869838, + "grad_norm": 1.0233113765716553, + "kl": 0.439453125, + "learning_rate": 9.177188116167025e-07, + "loss": 0.0546, + "reward": 1.1160714775323868, + "reward_std": 0.24031908437609673, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9486607611179352, + "step": 924 + }, + { + "completion_length": 1052.6764068603516, + "epoch": 0.27630498095735945, + "grad_norm": 0.5812583565711975, + "kl": 0.377685546875, + "learning_rate": 9.174480618695829e-07, + "loss": 0.0473, + "reward": 1.0089286416769028, + "reward_std": 0.21465561538934708, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.9553571790456772, + "step": 925 + }, + { + "completion_length": 1159.2902526855469, + "epoch": 0.27660368904488086, + "grad_norm": 0.8282712697982788, + "kl": 0.3662109375, + "learning_rate": 9.171769123749857e-07, + "loss": 0.0145, + "reward": 1.0781250447034836, + "reward_std": 0.21245884895324707, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.9620536118745804, + "step": 926 + }, + { + "completion_length": 1056.6473999023438, + "epoch": 0.27690239713240233, + "grad_norm": 0.3392133414745331, + "kl": 0.35009765625, + "learning_rate": 9.169053634278952e-07, + "loss": 0.0369, + "reward": 1.0915179252624512, + "reward_std": 0.1814526580274105, + "rewards/accuracy_reward": 0.10937500861473382, + "rewards/format_reward": 0.98214291036129, + "step": 927 + }, + { + "completion_length": 973.4107666015625, + "epoch": 0.2772011052199238, + "grad_norm": 0.6546569466590881, + "kl": 0.228515625, + "learning_rate": 9.166334153237298e-07, + "loss": 0.0391, + "reward": 1.2031250596046448, + "reward_std": 0.24120676890015602, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.9665178954601288, + "step": 928 + }, + { + "completion_length": 1183.8281860351562, + "epoch": 0.2774998133074453, + "grad_norm": 0.6545471549034119, + "kl": 0.39697265625, + "learning_rate": 9.163610683583426e-07, + "loss": 0.0446, + "reward": 1.0267857313156128, + "reward_std": 0.15965971909463406, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.9665178954601288, + "step": 929 + }, + { + "completion_length": 1087.6139068603516, + "epoch": 0.27779852139496675, + "grad_norm": 0.6167736649513245, + "kl": 0.26318359375, + "learning_rate": 9.1608832282802e-07, + "loss": 0.0391, + "reward": 1.1473214626312256, + "reward_std": 0.20837099850177765, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.9665178954601288, + "step": 930 + }, + { + "completion_length": 1080.8527374267578, + "epoch": 0.2780972294824882, + "grad_norm": 0.5274073481559753, + "kl": 0.365234375, + "learning_rate": 9.158151790294828e-07, + "loss": 0.0158, + "reward": 1.1339286118745804, + "reward_std": 0.2437591552734375, + "rewards/accuracy_reward": 0.17857143888249993, + "rewards/format_reward": 0.9553571939468384, + "step": 931 + }, + { + "completion_length": 1149.9330749511719, + "epoch": 0.2783959375700097, + "grad_norm": 0.5835480093955994, + "kl": 0.39794921875, + "learning_rate": 9.155416372598847e-07, + "loss": 0.0257, + "reward": 1.0602679252624512, + "reward_std": 0.21113171242177486, + "rewards/accuracy_reward": 0.09151786239817739, + "rewards/format_reward": 0.9687500447034836, + "step": 932 + }, + { + "completion_length": 1181.7411193847656, + "epoch": 0.27869464565753116, + "grad_norm": 0.7679927945137024, + "kl": 0.349609375, + "learning_rate": 9.152676978168121e-07, + "loss": 0.0291, + "reward": 1.1562500894069672, + "reward_std": 0.23991616070270538, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9620536118745804, + "step": 933 + }, + { + "completion_length": 1156.388442993164, + "epoch": 0.27899335374505263, + "grad_norm": 0.7289705872535706, + "kl": 0.27099609375, + "learning_rate": 9.149933609982843e-07, + "loss": 0.0369, + "reward": 1.1674107760190964, + "reward_std": 0.14561800193041563, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9799107611179352, + "step": 934 + }, + { + "completion_length": 1028.1920166015625, + "epoch": 0.2792920618325741, + "grad_norm": 0.9587408304214478, + "kl": 0.203369140625, + "learning_rate": 9.14718627102753e-07, + "loss": 0.0633, + "reward": 1.1227678656578064, + "reward_std": 0.24006244540214539, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.9709821790456772, + "step": 935 + }, + { + "completion_length": 1114.4465026855469, + "epoch": 0.27959076992009557, + "grad_norm": 1.0410571098327637, + "kl": 0.295166015625, + "learning_rate": 9.144434964291017e-07, + "loss": 0.066, + "reward": 1.191964328289032, + "reward_std": 0.23044020682573318, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/format_reward": 0.973214328289032, + "step": 936 + }, + { + "completion_length": 1098.5335540771484, + "epoch": 0.27988947800761704, + "grad_norm": 0.5787912607192993, + "kl": 0.2724609375, + "learning_rate": 9.141679692766453e-07, + "loss": 0.0405, + "reward": 1.1316964626312256, + "reward_std": 0.22812709212303162, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.9732143431901932, + "step": 937 + }, + { + "completion_length": 1037.982177734375, + "epoch": 0.2801881860951385, + "grad_norm": 0.5167365670204163, + "kl": 0.32177734375, + "learning_rate": 9.138920459451309e-07, + "loss": 0.0244, + "reward": 1.1674107611179352, + "reward_std": 0.2009208844974637, + "rewards/accuracy_reward": 0.191964291036129, + "rewards/format_reward": 0.9754464626312256, + "step": 938 + }, + { + "completion_length": 1029.3214874267578, + "epoch": 0.28048689418266, + "grad_norm": 0.49570053815841675, + "kl": 0.351318359375, + "learning_rate": 9.136157267347358e-07, + "loss": 0.0247, + "reward": 1.0357143431901932, + "reward_std": 0.19827282056212425, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.9620536118745804, + "step": 939 + }, + { + "completion_length": 1016.7299652099609, + "epoch": 0.28078560227018146, + "grad_norm": 0.5890031456947327, + "kl": 0.34375, + "learning_rate": 9.133390119460681e-07, + "loss": 0.0639, + "reward": 1.1116071939468384, + "reward_std": 0.21990188211202621, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.9687500447034836, + "step": 940 + }, + { + "completion_length": 1006.9933471679688, + "epoch": 0.2810843103577029, + "grad_norm": 0.8667070865631104, + "kl": 0.41943359375, + "learning_rate": 9.130619018801664e-07, + "loss": 0.017, + "reward": 1.1718750596046448, + "reward_std": 0.19914056919515133, + "rewards/accuracy_reward": 0.20089286053553224, + "rewards/format_reward": 0.9709821939468384, + "step": 941 + }, + { + "completion_length": 1012.3036193847656, + "epoch": 0.2813830184452244, + "grad_norm": 0.8025014400482178, + "kl": 0.3955078125, + "learning_rate": 9.127843968384994e-07, + "loss": 0.0143, + "reward": 1.1026785969734192, + "reward_std": 0.18664126470685005, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9754464775323868, + "step": 942 + }, + { + "completion_length": 990.1496124267578, + "epoch": 0.28168172653274587, + "grad_norm": 0.7788614630699158, + "kl": 0.38720703125, + "learning_rate": 9.125064971229654e-07, + "loss": 0.0316, + "reward": 1.116071492433548, + "reward_std": 0.19843996688723564, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.9799107611179352, + "step": 943 + }, + { + "completion_length": 1059.3281860351562, + "epoch": 0.28198043462026734, + "grad_norm": 1.0021851062774658, + "kl": 0.4453125, + "learning_rate": 9.122282030358918e-07, + "loss": 0.046, + "reward": 1.0691964626312256, + "reward_std": 0.17127872817218304, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.9866071790456772, + "step": 944 + }, + { + "completion_length": 1006.3326263427734, + "epoch": 0.2822791427077888, + "grad_norm": 5.415305137634277, + "kl": 0.37451171875, + "learning_rate": 9.119495148800357e-07, + "loss": 0.0144, + "reward": 1.084821492433548, + "reward_std": 0.155256699770689, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.9821428954601288, + "step": 945 + }, + { + "completion_length": 1052.2611846923828, + "epoch": 0.2825778507953103, + "grad_norm": 0.4041845500469208, + "kl": 0.320068359375, + "learning_rate": 9.116704329585822e-07, + "loss": 0.0222, + "reward": 1.0892857611179352, + "reward_std": 0.18608811125159264, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.9776786118745804, + "step": 946 + }, + { + "completion_length": 1011.6094207763672, + "epoch": 0.28287655888283175, + "grad_norm": 0.46088600158691406, + "kl": 0.2412109375, + "learning_rate": 9.11390957575145e-07, + "loss": 0.0309, + "reward": 1.1049107909202576, + "reward_std": 0.1660152804106474, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.9843750298023224, + "step": 947 + }, + { + "completion_length": 1197.6652526855469, + "epoch": 0.2831752669703532, + "grad_norm": 0.6392695307731628, + "kl": 0.398193359375, + "learning_rate": 9.111110890337661e-07, + "loss": 0.0649, + "reward": 1.1339286267757416, + "reward_std": 0.17605983931571245, + "rewards/accuracy_reward": 0.16294643771834671, + "rewards/format_reward": 0.970982164144516, + "step": 948 + }, + { + "completion_length": 1087.3594360351562, + "epoch": 0.2834739750578747, + "grad_norm": 0.6380780339241028, + "kl": 0.314697265625, + "learning_rate": 9.108308276389152e-07, + "loss": 0.0502, + "reward": 1.0089286118745804, + "reward_std": 0.15219878032803535, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.9687500447034836, + "step": 949 + }, + { + "completion_length": 1136.8058471679688, + "epoch": 0.28377268314539617, + "grad_norm": 0.7428329586982727, + "kl": 0.28271484375, + "learning_rate": 9.105501736954889e-07, + "loss": 0.0443, + "reward": 1.0937500298023224, + "reward_std": 0.18026357516646385, + "rewards/accuracy_reward": 0.12946429150179029, + "rewards/format_reward": 0.9642857760190964, + "step": 950 + }, + { + "completion_length": 1024.1027221679688, + "epoch": 0.28407139123291764, + "grad_norm": 0.27586042881011963, + "kl": 0.218505859375, + "learning_rate": 9.102691275088115e-07, + "loss": 0.0253, + "reward": 1.1964286267757416, + "reward_std": 0.21434246376156807, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9843750447034836, + "step": 951 + }, + { + "completion_length": 1070.1808776855469, + "epoch": 0.2843700993204391, + "grad_norm": 0.5929885506629944, + "kl": 0.28173828125, + "learning_rate": 9.099876893846333e-07, + "loss": 0.0279, + "reward": 1.0736607313156128, + "reward_std": 0.178789421916008, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.9776785969734192, + "step": 952 + }, + { + "completion_length": 955.4665832519531, + "epoch": 0.2846688074079606, + "grad_norm": 0.6405013799667358, + "kl": 0.214111328125, + "learning_rate": 9.097058596291319e-07, + "loss": 0.0332, + "reward": 1.209821492433548, + "reward_std": 0.1932855360209942, + "rewards/accuracy_reward": 0.22544643376022577, + "rewards/format_reward": 0.9843750298023224, + "step": 953 + }, + { + "completion_length": 1081.2879943847656, + "epoch": 0.28496751549548205, + "grad_norm": 0.8920402526855469, + "kl": 0.34765625, + "learning_rate": 9.0942363854891e-07, + "loss": 0.0286, + "reward": 1.1562500596046448, + "reward_std": 0.30734623223543167, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.9665178954601288, + "step": 954 + }, + { + "completion_length": 1027.417465209961, + "epoch": 0.2852662235830035, + "grad_norm": 0.4490554928779602, + "kl": 0.292724609375, + "learning_rate": 9.091410264509968e-07, + "loss": 0.0113, + "reward": 1.1183035969734192, + "reward_std": 0.2135031893849373, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9687500298023224, + "step": 955 + }, + { + "completion_length": 1155.2277221679688, + "epoch": 0.285564931670525, + "grad_norm": 0.5174396634101868, + "kl": 0.289306640625, + "learning_rate": 9.088580236428463e-07, + "loss": 0.0297, + "reward": 1.1406250596046448, + "reward_std": 0.2305765524506569, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.964285746216774, + "step": 956 + }, + { + "completion_length": 1061.0670318603516, + "epoch": 0.28586363975804646, + "grad_norm": 0.6031702756881714, + "kl": 0.294189453125, + "learning_rate": 9.085746304323381e-07, + "loss": 0.0462, + "reward": 1.0892857760190964, + "reward_std": 0.22275298461318016, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.96651791036129, + "step": 957 + }, + { + "completion_length": 1104.7544860839844, + "epoch": 0.28616234784556793, + "grad_norm": 0.5191638469696045, + "kl": 0.254638671875, + "learning_rate": 9.082908471277761e-07, + "loss": 0.0144, + "reward": 1.1071429252624512, + "reward_std": 0.2251630686223507, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9598214775323868, + "step": 958 + }, + { + "completion_length": 1118.0670166015625, + "epoch": 0.2864610559330894, + "grad_norm": 0.43494078516960144, + "kl": 0.27001953125, + "learning_rate": 9.080066740378884e-07, + "loss": 0.0206, + "reward": 1.176339328289032, + "reward_std": 0.17874165065586567, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.9665178954601288, + "step": 959 + }, + { + "completion_length": 1021.9822235107422, + "epoch": 0.2867597640206109, + "grad_norm": 0.6154091358184814, + "kl": 0.323486328125, + "learning_rate": 9.077221114718279e-07, + "loss": 0.0538, + "reward": 1.0937500596046448, + "reward_std": 0.2235293835401535, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9464286118745804, + "step": 960 + }, + { + "completion_length": 1153.2567749023438, + "epoch": 0.28705847210813235, + "grad_norm": 0.667872965335846, + "kl": 0.345703125, + "learning_rate": 9.074371597391708e-07, + "loss": 0.019, + "reward": 1.1026786118745804, + "reward_std": 0.29034487158060074, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9531250298023224, + "step": 961 + }, + { + "completion_length": 1098.325927734375, + "epoch": 0.2873571801956538, + "grad_norm": 0.4358804523944855, + "kl": 0.32080078125, + "learning_rate": 9.071518191499164e-07, + "loss": 0.0454, + "reward": 1.1875000596046448, + "reward_std": 0.2535649724304676, + "rewards/accuracy_reward": 0.22767858766019344, + "rewards/format_reward": 0.9598214775323868, + "step": 962 + }, + { + "completion_length": 1058.2835388183594, + "epoch": 0.2876558882831753, + "grad_norm": 0.41142159700393677, + "kl": 0.244140625, + "learning_rate": 9.068660900144874e-07, + "loss": 0.0199, + "reward": 1.178571492433548, + "reward_std": 0.19525780901312828, + "rewards/accuracy_reward": 0.19866072572767735, + "rewards/format_reward": 0.979910746216774, + "step": 963 + }, + { + "completion_length": 1120.9755249023438, + "epoch": 0.28795459637069676, + "grad_norm": 0.49775728583335876, + "kl": 0.3359375, + "learning_rate": 9.065799726437291e-07, + "loss": 0.0415, + "reward": 1.0781250894069672, + "reward_std": 0.21476422995328903, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9553571790456772, + "step": 964 + }, + { + "completion_length": 1161.8929138183594, + "epoch": 0.28825330445821823, + "grad_norm": 0.5020366311073303, + "kl": 0.3896484375, + "learning_rate": 9.062934673489091e-07, + "loss": 0.0581, + "reward": 1.051339328289032, + "reward_std": 0.20162386819720268, + "rewards/accuracy_reward": 0.09598215017467737, + "rewards/format_reward": 0.9553571790456772, + "step": 965 + }, + { + "completion_length": 1104.464340209961, + "epoch": 0.2885520125457397, + "grad_norm": 0.5375080704689026, + "kl": 0.267333984375, + "learning_rate": 9.060065744417172e-07, + "loss": 0.0252, + "reward": 1.1852678954601288, + "reward_std": 0.21180802956223488, + "rewards/accuracy_reward": 0.21651786286383867, + "rewards/format_reward": 0.9687500447034836, + "step": 966 + }, + { + "completion_length": 1185.47998046875, + "epoch": 0.28885072063326117, + "grad_norm": 0.9345585703849792, + "kl": 0.298828125, + "learning_rate": 9.057192942342647e-07, + "loss": 0.0442, + "reward": 1.0959821939468384, + "reward_std": 0.2811482958495617, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.9531250298023224, + "step": 967 + }, + { + "completion_length": 1158.8661499023438, + "epoch": 0.28914942872078264, + "grad_norm": 0.8771857619285583, + "kl": 0.41748046875, + "learning_rate": 9.054316270390844e-07, + "loss": 0.0668, + "reward": 1.102678656578064, + "reward_std": 0.24416960030794144, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.9620536118745804, + "step": 968 + }, + { + "completion_length": 1194.2410888671875, + "epoch": 0.28944813680830406, + "grad_norm": 1.3645786046981812, + "kl": 0.6455078125, + "learning_rate": 9.051435731691299e-07, + "loss": 0.105, + "reward": 1.0937500298023224, + "reward_std": 0.27756574377417564, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9508928954601288, + "step": 969 + }, + { + "completion_length": 1142.8348693847656, + "epoch": 0.28974684489582553, + "grad_norm": 0.85869961977005, + "kl": 0.4345703125, + "learning_rate": 9.048551329377755e-07, + "loss": 0.043, + "reward": 1.13839291036129, + "reward_std": 0.2919165603816509, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.957589328289032, + "step": 970 + }, + { + "completion_length": 1112.794677734375, + "epoch": 0.290045552983347, + "grad_norm": 0.5084105134010315, + "kl": 0.409912109375, + "learning_rate": 9.04566306658816e-07, + "loss": 0.0339, + "reward": 1.2366072237491608, + "reward_std": 0.24805452674627304, + "rewards/accuracy_reward": 0.2656250149011612, + "rewards/format_reward": 0.9709821939468384, + "step": 971 + }, + { + "completion_length": 1151.1004943847656, + "epoch": 0.29034426107086847, + "grad_norm": 2.1754398345947266, + "kl": 0.638671875, + "learning_rate": 9.042770946464662e-07, + "loss": 0.0319, + "reward": 1.0290179252624512, + "reward_std": 0.2075769193470478, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.9665178954601288, + "step": 972 + }, + { + "completion_length": 1003.5491485595703, + "epoch": 0.29064296915838994, + "grad_norm": 1.2278785705566406, + "kl": 0.50537109375, + "learning_rate": 9.039874972153604e-07, + "loss": 0.0187, + "reward": 1.0758928954601288, + "reward_std": 0.22065702825784683, + "rewards/accuracy_reward": 0.10937500279396772, + "rewards/format_reward": 0.9665178954601288, + "step": 973 + }, + { + "completion_length": 1193.2611999511719, + "epoch": 0.2909416772459114, + "grad_norm": 0.5586031675338745, + "kl": 0.42138671875, + "learning_rate": 9.036975146805519e-07, + "loss": 0.043, + "reward": 1.1517857909202576, + "reward_std": 0.20125404000282288, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.9665178805589676, + "step": 974 + }, + { + "completion_length": 1074.997817993164, + "epoch": 0.2912403853334329, + "grad_norm": 0.5471972227096558, + "kl": 0.3916015625, + "learning_rate": 9.034071473575136e-07, + "loss": 0.011, + "reward": 1.145089328289032, + "reward_std": 0.18176092952489853, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.9709821790456772, + "step": 975 + }, + { + "completion_length": 1093.7701416015625, + "epoch": 0.29153909342095435, + "grad_norm": 1.035221815109253, + "kl": 0.39404296875, + "learning_rate": 9.031163955621365e-07, + "loss": 0.0638, + "reward": 1.1205357611179352, + "reward_std": 0.2390349954366684, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.973214328289032, + "step": 976 + }, + { + "completion_length": 1160.4129943847656, + "epoch": 0.2918378015084758, + "grad_norm": 0.722131609916687, + "kl": 0.3828125, + "learning_rate": 9.028252596107303e-07, + "loss": 0.0614, + "reward": 1.1049107611179352, + "reward_std": 0.23190094158053398, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.9687500447034836, + "step": 977 + }, + { + "completion_length": 1104.1719360351562, + "epoch": 0.2921365095959973, + "grad_norm": 0.5815181732177734, + "kl": 0.42041015625, + "learning_rate": 9.025337398200223e-07, + "loss": 0.0471, + "reward": 1.1250000596046448, + "reward_std": 0.15568062104284763, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.9754464626312256, + "step": 978 + }, + { + "completion_length": 1181.7009582519531, + "epoch": 0.29243521768351877, + "grad_norm": 0.5239471197128296, + "kl": 0.44970703125, + "learning_rate": 9.022418365071572e-07, + "loss": 0.0443, + "reward": 1.1517857909202576, + "reward_std": 0.25138669833540916, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.957589328289032, + "step": 979 + }, + { + "completion_length": 1166.841552734375, + "epoch": 0.29273392577104024, + "grad_norm": 0.49589958786964417, + "kl": 0.4853515625, + "learning_rate": 9.019495499896975e-07, + "loss": 0.0561, + "reward": 1.0468750596046448, + "reward_std": 0.2332757655531168, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.941964328289032, + "step": 980 + }, + { + "completion_length": 1144.8504943847656, + "epoch": 0.2930326338585617, + "grad_norm": 0.6818940043449402, + "kl": 0.29541015625, + "learning_rate": 9.016568805856222e-07, + "loss": 0.019, + "reward": 1.1629464626312256, + "reward_std": 0.23324786499142647, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9665178954601288, + "step": 981 + }, + { + "completion_length": 1186.3192443847656, + "epoch": 0.2933313419460832, + "grad_norm": 0.5747969150543213, + "kl": 0.46728515625, + "learning_rate": 9.013638286133269e-07, + "loss": 0.0554, + "reward": 1.0446429252624512, + "reward_std": 0.203248493373394, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.9575893133878708, + "step": 982 + }, + { + "completion_length": 1129.5022888183594, + "epoch": 0.29363005003360465, + "grad_norm": 0.6331751942634583, + "kl": 0.41796875, + "learning_rate": 9.010703943916233e-07, + "loss": 0.04, + "reward": 1.1339286267757416, + "reward_std": 0.20241139456629753, + "rewards/accuracy_reward": 0.16741072316654027, + "rewards/format_reward": 0.96651791036129, + "step": 983 + }, + { + "completion_length": 1143.7411193847656, + "epoch": 0.2939287581211261, + "grad_norm": 0.5862615704536438, + "kl": 0.315673828125, + "learning_rate": 9.007765782397393e-07, + "loss": 0.0224, + "reward": 1.1383929252624512, + "reward_std": 0.24863410741090775, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9709821790456772, + "step": 984 + }, + { + "completion_length": 1243.5893249511719, + "epoch": 0.2942274662086476, + "grad_norm": 0.6872965097427368, + "kl": 0.38330078125, + "learning_rate": 9.004823804773179e-07, + "loss": 0.0399, + "reward": 1.116071492433548, + "reward_std": 0.18060174956917763, + "rewards/accuracy_reward": 0.13839286053553224, + "rewards/format_reward": 0.9776785969734192, + "step": 985 + }, + { + "completion_length": 1117.4197082519531, + "epoch": 0.29452617429616906, + "grad_norm": 0.36143553256988525, + "kl": 0.253662109375, + "learning_rate": 9.001878014244175e-07, + "loss": 0.0293, + "reward": 1.1875000298023224, + "reward_std": 0.20917291194200516, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.964285746216774, + "step": 986 + }, + { + "completion_length": 1163.9554138183594, + "epoch": 0.29482488238369053, + "grad_norm": 0.7751055955886841, + "kl": 0.30712890625, + "learning_rate": 8.998928414015113e-07, + "loss": 0.0546, + "reward": 1.1875000596046448, + "reward_std": 0.27013761922717094, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.9620536118745804, + "step": 987 + }, + { + "completion_length": 1134.5067443847656, + "epoch": 0.295123590471212, + "grad_norm": 0.5271091461181641, + "kl": 0.285888671875, + "learning_rate": 8.99597500729487e-07, + "loss": 0.04, + "reward": 1.1049107909202576, + "reward_std": 0.18517849408090115, + "rewards/accuracy_reward": 0.13616071734577417, + "rewards/format_reward": 0.9687500447034836, + "step": 988 + }, + { + "completion_length": 1190.3437805175781, + "epoch": 0.2954222985587335, + "grad_norm": 0.5468838810920715, + "kl": 0.27685546875, + "learning_rate": 8.993017797296458e-07, + "loss": 0.0289, + "reward": 1.1250000596046448, + "reward_std": 0.19768429547548294, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9553571790456772, + "step": 989 + }, + { + "completion_length": 1184.4018249511719, + "epoch": 0.29572100664625495, + "grad_norm": 1.2269253730773926, + "kl": 0.263427734375, + "learning_rate": 8.990056787237038e-07, + "loss": 0.0579, + "reward": 1.071428656578064, + "reward_std": 0.15646198857575655, + "rewards/accuracy_reward": 0.09598214388825, + "rewards/format_reward": 0.9754464626312256, + "step": 990 + }, + { + "completion_length": 1192.4308776855469, + "epoch": 0.2960197147337764, + "grad_norm": 0.7671657204627991, + "kl": 0.3251953125, + "learning_rate": 8.987091980337894e-07, + "loss": 0.0175, + "reward": 1.1339286267757416, + "reward_std": 0.24339015409350395, + "rewards/accuracy_reward": 0.17633929336443543, + "rewards/format_reward": 0.957589328289032, + "step": 991 + }, + { + "completion_length": 1270.7969360351562, + "epoch": 0.2963184228212979, + "grad_norm": 0.4855984151363373, + "kl": 0.42236328125, + "learning_rate": 8.984123379824448e-07, + "loss": 0.0418, + "reward": 1.1071429252624512, + "reward_std": 0.1829575002193451, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9776786118745804, + "step": 992 + }, + { + "completion_length": 1279.091552734375, + "epoch": 0.29661713090881936, + "grad_norm": 0.7516826391220093, + "kl": 0.303466796875, + "learning_rate": 8.981150988926246e-07, + "loss": 0.0446, + "reward": 1.0915178954601288, + "reward_std": 0.2372024618089199, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9598214775323868, + "step": 993 + }, + { + "completion_length": 1257.2656860351562, + "epoch": 0.29691583899634083, + "grad_norm": 0.9493581056594849, + "kl": 0.306640625, + "learning_rate": 8.978174810876958e-07, + "loss": 0.0252, + "reward": 1.1875000894069672, + "reward_std": 0.2213444747030735, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.973214328289032, + "step": 994 + }, + { + "completion_length": 1222.6384582519531, + "epoch": 0.2972145470838623, + "grad_norm": 0.6565985083580017, + "kl": 0.3291015625, + "learning_rate": 8.975194848914371e-07, + "loss": 0.0353, + "reward": 1.1227678954601288, + "reward_std": 0.16020704992115498, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9821428954601288, + "step": 995 + }, + { + "completion_length": 1172.7232666015625, + "epoch": 0.2975132551713838, + "grad_norm": 0.6193428039550781, + "kl": 0.3720703125, + "learning_rate": 8.972211106280397e-07, + "loss": 0.01, + "reward": 1.1696429252624512, + "reward_std": 0.201943077147007, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9687500447034836, + "step": 996 + }, + { + "completion_length": 1211.43310546875, + "epoch": 0.29781196325890524, + "grad_norm": 0.7058922052383423, + "kl": 0.41845703125, + "learning_rate": 8.96922358622105e-07, + "loss": 0.0344, + "reward": 1.1049107611179352, + "reward_std": 0.24070242047309875, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 997 + }, + { + "completion_length": 1245.7455749511719, + "epoch": 0.2981106713464267, + "grad_norm": 0.817310631275177, + "kl": 0.439453125, + "learning_rate": 8.966232291986462e-07, + "loss": 0.032, + "reward": 1.2053572237491608, + "reward_std": 0.1892482452094555, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9754464775323868, + "step": 998 + }, + { + "completion_length": 1182.2366638183594, + "epoch": 0.2984093794339482, + "grad_norm": 0.7012380361557007, + "kl": 0.38134765625, + "learning_rate": 8.963237226830869e-07, + "loss": 0.0492, + "reward": 1.1495535969734192, + "reward_std": 0.23513554222881794, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9665178954601288, + "step": 999 + }, + { + "completion_length": 1201.9063110351562, + "epoch": 0.29870808752146966, + "grad_norm": 0.7501558065414429, + "kl": 0.368408203125, + "learning_rate": 8.960238394012607e-07, + "loss": 0.0215, + "reward": 1.1116071939468384, + "reward_std": 0.18770071677863598, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.9732143133878708, + "step": 1000 + }, + { + "completion_length": 1198.3906555175781, + "epoch": 0.2990067956089911, + "grad_norm": 0.5562548041343689, + "kl": 0.400390625, + "learning_rate": 8.957235796794111e-07, + "loss": 0.0307, + "reward": 1.1406250149011612, + "reward_std": 0.20256293565034866, + "rewards/accuracy_reward": 0.16741072502918541, + "rewards/format_reward": 0.973214328289032, + "step": 1001 + }, + { + "completion_length": 1212.5715026855469, + "epoch": 0.2993055036965126, + "grad_norm": 0.638306200504303, + "kl": 0.3740234375, + "learning_rate": 8.954229438441915e-07, + "loss": 0.0488, + "reward": 1.1071428954601288, + "reward_std": 0.1948051154613495, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 1002 + }, + { + "completion_length": 1169.841552734375, + "epoch": 0.29960421178403407, + "grad_norm": 1.201653242111206, + "kl": 0.34375, + "learning_rate": 8.951219322226638e-07, + "loss": 0.0501, + "reward": 1.1875000447034836, + "reward_std": 0.19830162078142166, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.973214328289032, + "step": 1003 + }, + { + "completion_length": 1056.7589874267578, + "epoch": 0.29990291987155554, + "grad_norm": 1.035495400428772, + "kl": 0.408203125, + "learning_rate": 8.948205451422996e-07, + "loss": 0.0468, + "reward": 1.0758928954601288, + "reward_std": 0.2172699235379696, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.9598214626312256, + "step": 1004 + }, + { + "completion_length": 1217.0045471191406, + "epoch": 0.300201627959077, + "grad_norm": 0.6532241106033325, + "kl": 0.4541015625, + "learning_rate": 8.945187829309784e-07, + "loss": 0.0339, + "reward": 1.1383929252624512, + "reward_std": 0.19286992028355598, + "rewards/accuracy_reward": 0.17187500675208867, + "rewards/format_reward": 0.96651791036129, + "step": 1005 + }, + { + "completion_length": 1125.6183471679688, + "epoch": 0.3005003360465985, + "grad_norm": 0.8186260461807251, + "kl": 0.3740234375, + "learning_rate": 8.942166459169879e-07, + "loss": 0.0028, + "reward": 1.0334821939468384, + "reward_std": 0.19595249369740486, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.964285746216774, + "step": 1006 + }, + { + "completion_length": 1092.9598693847656, + "epoch": 0.30079904413411995, + "grad_norm": 1.0821847915649414, + "kl": 0.38671875, + "learning_rate": 8.939141344290233e-07, + "loss": 0.0581, + "reward": 1.1294643580913544, + "reward_std": 0.2165052369236946, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9732143133878708, + "step": 1007 + }, + { + "completion_length": 1163.6429443359375, + "epoch": 0.3010977522216414, + "grad_norm": 1.2476756572723389, + "kl": 0.51318359375, + "learning_rate": 8.936112487961877e-07, + "loss": 0.0512, + "reward": 1.1361607611179352, + "reward_std": 0.2274093497544527, + "rewards/accuracy_reward": 0.18303572572767735, + "rewards/format_reward": 0.9531250596046448, + "step": 1008 + }, + { + "completion_length": 1248.9576721191406, + "epoch": 0.3013964603091629, + "grad_norm": 0.6124497056007385, + "kl": 0.470703125, + "learning_rate": 8.933079893479911e-07, + "loss": 0.0234, + "reward": 1.1183036267757416, + "reward_std": 0.2836826853454113, + "rewards/accuracy_reward": 0.1584821455180645, + "rewards/format_reward": 0.9598214775323868, + "step": 1009 + }, + { + "completion_length": 1113.8572082519531, + "epoch": 0.30169516839668437, + "grad_norm": 0.6133726835250854, + "kl": 0.4365234375, + "learning_rate": 8.930043564143497e-07, + "loss": 0.043, + "reward": 1.1250000596046448, + "reward_std": 0.24296309426426888, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9575893431901932, + "step": 1010 + }, + { + "completion_length": 1186.4710388183594, + "epoch": 0.30199387648420584, + "grad_norm": 0.6840566992759705, + "kl": 0.408203125, + "learning_rate": 8.927003503255866e-07, + "loss": 0.036, + "reward": 1.0848214775323868, + "reward_std": 0.20400092005729675, + "rewards/accuracy_reward": 0.1250000095460564, + "rewards/format_reward": 0.9598214775323868, + "step": 1011 + }, + { + "completion_length": 1065.8080596923828, + "epoch": 0.30229258457172725, + "grad_norm": 0.8958324790000916, + "kl": 0.38525390625, + "learning_rate": 8.923959714124306e-07, + "loss": 0.0627, + "reward": 1.1562500894069672, + "reward_std": 0.23110471293330193, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9665178954601288, + "step": 1012 + }, + { + "completion_length": 1096.0067443847656, + "epoch": 0.3025912926592487, + "grad_norm": 0.8273324370384216, + "kl": 0.40576171875, + "learning_rate": 8.920912200060161e-07, + "loss": 0.0202, + "reward": 1.1227679252624512, + "reward_std": 0.2727979086339474, + "rewards/accuracy_reward": 0.1629464328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 1013 + }, + { + "completion_length": 1120.2545166015625, + "epoch": 0.3028900007467702, + "grad_norm": 0.5702931880950928, + "kl": 0.53271484375, + "learning_rate": 8.917860964378829e-07, + "loss": 0.0666, + "reward": 1.1674107760190964, + "reward_std": 0.23417052812874317, + "rewards/accuracy_reward": 0.2098214440047741, + "rewards/format_reward": 0.957589328289032, + "step": 1014 + }, + { + "completion_length": 1104.1763916015625, + "epoch": 0.30318870883429166, + "grad_norm": 1.0165836811065674, + "kl": 0.509765625, + "learning_rate": 8.914806010399753e-07, + "loss": 0.0635, + "reward": 1.162946492433548, + "reward_std": 0.25959716737270355, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9687500447034836, + "step": 1015 + }, + { + "completion_length": 1113.2031860351562, + "epoch": 0.30348741692181314, + "grad_norm": 1.170801043510437, + "kl": 0.47412109375, + "learning_rate": 8.911747341446425e-07, + "loss": 0.0782, + "reward": 1.1651786267757416, + "reward_std": 0.22908079996705055, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9732143431901932, + "step": 1016 + }, + { + "completion_length": 1113.7277374267578, + "epoch": 0.3037861250093346, + "grad_norm": 0.6066778898239136, + "kl": 0.5166015625, + "learning_rate": 8.908684960846376e-07, + "loss": 0.0505, + "reward": 1.09151791036129, + "reward_std": 0.21521663945168257, + "rewards/accuracy_reward": 0.1294642873108387, + "rewards/format_reward": 0.9620536118745804, + "step": 1017 + }, + { + "completion_length": 1132.7522735595703, + "epoch": 0.3040848330968561, + "grad_norm": 0.6409362554550171, + "kl": 0.42333984375, + "learning_rate": 8.905618871931177e-07, + "loss": 0.0308, + "reward": 1.238839328289032, + "reward_std": 0.23071682453155518, + "rewards/accuracy_reward": 0.2834821529686451, + "rewards/format_reward": 0.9553571790456772, + "step": 1018 + }, + { + "completion_length": 1139.2121276855469, + "epoch": 0.30438354118437755, + "grad_norm": 2.7751688957214355, + "kl": 0.44873046875, + "learning_rate": 8.902549078036433e-07, + "loss": 0.0521, + "reward": 1.191964328289032, + "reward_std": 0.1565282940864563, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.973214328289032, + "step": 1019 + }, + { + "completion_length": 1150.7299499511719, + "epoch": 0.304682249271899, + "grad_norm": 0.5277963280677795, + "kl": 0.42333984375, + "learning_rate": 8.899475582501775e-07, + "loss": 0.039, + "reward": 1.1495536267757416, + "reward_std": 0.18294814601540565, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.9732143431901932, + "step": 1020 + }, + { + "completion_length": 1174.5045166015625, + "epoch": 0.3049809573594205, + "grad_norm": 0.6571467518806458, + "kl": 0.4541015625, + "learning_rate": 8.89639838867087e-07, + "loss": 0.0535, + "reward": 1.1339286267757416, + "reward_std": 0.23839029110968113, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9665178954601288, + "step": 1021 + }, + { + "completion_length": 1130.7701416015625, + "epoch": 0.30527966544694196, + "grad_norm": 0.8074598908424377, + "kl": 0.379638671875, + "learning_rate": 8.893317499891401e-07, + "loss": 0.0388, + "reward": 1.113839328289032, + "reward_std": 0.24112972617149353, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.973214328289032, + "step": 1022 + }, + { + "completion_length": 1113.6005096435547, + "epoch": 0.30557837353446343, + "grad_norm": 0.7473259568214417, + "kl": 0.4921875, + "learning_rate": 8.890232919515071e-07, + "loss": 0.0681, + "reward": 1.2075893580913544, + "reward_std": 0.21761994808912277, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9598214775323868, + "step": 1023 + }, + { + "completion_length": 1084.3660888671875, + "epoch": 0.3058770816219849, + "grad_norm": 0.8395385146141052, + "kl": 0.39453125, + "learning_rate": 8.887144650897606e-07, + "loss": 0.0342, + "reward": 1.0736607313156128, + "reward_std": 0.2388089932501316, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.9709821790456772, + "step": 1024 + }, + { + "completion_length": 1232.3348999023438, + "epoch": 0.3061757897095064, + "grad_norm": 0.8391432762145996, + "kl": 0.41162109375, + "learning_rate": 8.884052697398735e-07, + "loss": 0.0511, + "reward": 1.1026786267757416, + "reward_std": 0.23479115217924118, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.957589328289032, + "step": 1025 + }, + { + "completion_length": 1027.839340209961, + "epoch": 0.30647449779702785, + "grad_norm": 0.698376476764679, + "kl": 0.4072265625, + "learning_rate": 8.880957062382202e-07, + "loss": 0.0269, + "reward": 1.189732164144516, + "reward_std": 0.21471177600324154, + "rewards/accuracy_reward": 0.2299107201397419, + "rewards/format_reward": 0.9598214477300644, + "step": 1026 + }, + { + "completion_length": 1142.5090026855469, + "epoch": 0.3067732058845493, + "grad_norm": 0.7509124279022217, + "kl": 0.34423828125, + "learning_rate": 8.877857749215755e-07, + "loss": 0.0653, + "reward": 1.0602678954601288, + "reward_std": 0.20407340675592422, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.973214328289032, + "step": 1027 + }, + { + "completion_length": 1146.8504943847656, + "epoch": 0.3070719139720708, + "grad_norm": 0.5915194153785706, + "kl": 0.265625, + "learning_rate": 8.874754761271142e-07, + "loss": 0.0186, + "reward": 1.1093750596046448, + "reward_std": 0.2314216997474432, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9620536118745804, + "step": 1028 + }, + { + "completion_length": 1209.2902221679688, + "epoch": 0.30737062205959226, + "grad_norm": 0.8450536727905273, + "kl": 0.322509765625, + "learning_rate": 8.871648101924109e-07, + "loss": 0.0729, + "reward": 1.0736607611179352, + "reward_std": 0.2249983288347721, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9598214775323868, + "step": 1029 + }, + { + "completion_length": 1120.0201416015625, + "epoch": 0.30766933014711373, + "grad_norm": 0.7520185708999634, + "kl": 0.36328125, + "learning_rate": 8.8685377745544e-07, + "loss": 0.0279, + "reward": 1.0290179252624512, + "reward_std": 0.15143464505672455, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.9687500298023224, + "step": 1030 + }, + { + "completion_length": 1113.5179138183594, + "epoch": 0.3079680382346352, + "grad_norm": 0.6455516815185547, + "kl": 0.2890625, + "learning_rate": 8.865423782545745e-07, + "loss": 0.0158, + "reward": 1.100446492433548, + "reward_std": 0.25217172130942345, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9620536118745804, + "step": 1031 + }, + { + "completion_length": 1018.8728485107422, + "epoch": 0.30826674632215667, + "grad_norm": 0.6189129948616028, + "kl": 0.23046875, + "learning_rate": 8.86230612928586e-07, + "loss": 0.0299, + "reward": 1.2321428954601288, + "reward_std": 0.18216821178793907, + "rewards/accuracy_reward": 0.2455357313156128, + "rewards/format_reward": 0.9866071790456772, + "step": 1032 + }, + { + "completion_length": 1188.6920471191406, + "epoch": 0.30856545440967814, + "grad_norm": 0.6737827658653259, + "kl": 0.4111328125, + "learning_rate": 8.859184818166449e-07, + "loss": 0.0331, + "reward": 1.0803571939468384, + "reward_std": 0.2225414551794529, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9620535969734192, + "step": 1033 + }, + { + "completion_length": 1075.7857818603516, + "epoch": 0.3088641624971996, + "grad_norm": 0.719916045665741, + "kl": 0.3291015625, + "learning_rate": 8.85605985258319e-07, + "loss": 0.0457, + "reward": 1.1540179252624512, + "reward_std": 0.23921366780996323, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9665178954601288, + "step": 1034 + }, + { + "completion_length": 1078.513442993164, + "epoch": 0.3091628705847211, + "grad_norm": 0.5876836180686951, + "kl": 0.335205078125, + "learning_rate": 8.852931235935741e-07, + "loss": 0.0388, + "reward": 1.0803571939468384, + "reward_std": 0.15933088399469852, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9732143133878708, + "step": 1035 + }, + { + "completion_length": 1114.7678833007812, + "epoch": 0.30946157867224255, + "grad_norm": 0.496092289686203, + "kl": 0.356201171875, + "learning_rate": 8.849798971627731e-07, + "loss": 0.0559, + "reward": 1.0937500596046448, + "reward_std": 0.267391175031662, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9419643431901932, + "step": 1036 + }, + { + "completion_length": 1085.3326416015625, + "epoch": 0.309760286759764, + "grad_norm": 1.0572519302368164, + "kl": 0.39794921875, + "learning_rate": 8.846663063066754e-07, + "loss": 0.0674, + "reward": 1.1808036267757416, + "reward_std": 0.24029014632105827, + "rewards/accuracy_reward": 0.2254464328289032, + "rewards/format_reward": 0.9553571939468384, + "step": 1037 + }, + { + "completion_length": 1027.7589569091797, + "epoch": 0.3100589948472855, + "grad_norm": 0.473903626203537, + "kl": 0.33203125, + "learning_rate": 8.843523513664373e-07, + "loss": 0.0344, + "reward": 1.1607142984867096, + "reward_std": 0.20169370993971825, + "rewards/accuracy_reward": 0.1808035746216774, + "rewards/format_reward": 0.979910746216774, + "step": 1038 + }, + { + "completion_length": 1046.5893249511719, + "epoch": 0.31035770293480697, + "grad_norm": 0.5080247521400452, + "kl": 0.40673828125, + "learning_rate": 8.840380326836111e-07, + "loss": 0.035, + "reward": 1.1205357909202576, + "reward_std": 0.24646694213151932, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 1039 + }, + { + "completion_length": 1062.2076416015625, + "epoch": 0.31065641102232844, + "grad_norm": 1.0752677917480469, + "kl": 0.5625, + "learning_rate": 8.837233506001443e-07, + "loss": 0.1033, + "reward": 1.1383928954601288, + "reward_std": 0.3105136752128601, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9375000298023224, + "step": 1040 + }, + { + "completion_length": 1025.7255096435547, + "epoch": 0.3109551191098499, + "grad_norm": 1.4036407470703125, + "kl": 0.6171875, + "learning_rate": 8.834083054583807e-07, + "loss": 0.0531, + "reward": 1.0156250596046448, + "reward_std": 0.22460739687085152, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.9553571790456772, + "step": 1041 + }, + { + "completion_length": 1034.7857513427734, + "epoch": 0.3112538271973714, + "grad_norm": 1.0202795267105103, + "kl": 0.5048828125, + "learning_rate": 8.830928976010581e-07, + "loss": 0.0621, + "reward": 1.1361607313156128, + "reward_std": 0.14363107457756996, + "rewards/accuracy_reward": 0.15848215157166123, + "rewards/format_reward": 0.9776786118745804, + "step": 1042 + }, + { + "completion_length": 1063.7054138183594, + "epoch": 0.31155253528489285, + "grad_norm": 1.1113314628601074, + "kl": 0.59521484375, + "learning_rate": 8.827771273713097e-07, + "loss": 0.0318, + "reward": 1.0803572088479996, + "reward_std": 0.21820153668522835, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9553571790456772, + "step": 1043 + }, + { + "completion_length": 1063.964340209961, + "epoch": 0.3118512433724143, + "grad_norm": 1.18477463722229, + "kl": 0.54296875, + "learning_rate": 8.824609951126624e-07, + "loss": 0.0527, + "reward": 1.1026786118745804, + "reward_std": 0.18148761987686157, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9732143133878708, + "step": 1044 + }, + { + "completion_length": 1157.8125305175781, + "epoch": 0.3121499514599358, + "grad_norm": 1.12227463722229, + "kl": 0.55517578125, + "learning_rate": 8.821445011690369e-07, + "loss": 0.0396, + "reward": 1.0602679252624512, + "reward_std": 0.19020790234208107, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.9620535969734192, + "step": 1045 + }, + { + "completion_length": 1130.2857971191406, + "epoch": 0.31244865954745726, + "grad_norm": 1.5454877614974976, + "kl": 0.5625, + "learning_rate": 8.81827645884748e-07, + "loss": 0.0585, + "reward": 1.1093750596046448, + "reward_std": 0.19970792531967163, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.973214328289032, + "step": 1046 + }, + { + "completion_length": 1173.2500610351562, + "epoch": 0.31274736763497873, + "grad_norm": 0.8325404524803162, + "kl": 0.43310546875, + "learning_rate": 8.815104296045028e-07, + "loss": 0.0494, + "reward": 1.1227678954601288, + "reward_std": 0.176397655159235, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9709821939468384, + "step": 1047 + }, + { + "completion_length": 1114.9107666015625, + "epoch": 0.3130460757225002, + "grad_norm": 0.7607965469360352, + "kl": 0.354248046875, + "learning_rate": 8.811928526734019e-07, + "loss": 0.0613, + "reward": 1.176339328289032, + "reward_std": 0.236023161560297, + "rewards/accuracy_reward": 0.2098214440047741, + "rewards/format_reward": 0.9665178954601288, + "step": 1048 + }, + { + "completion_length": 1155.8370971679688, + "epoch": 0.3133447838100217, + "grad_norm": 0.3752785325050354, + "kl": 0.335205078125, + "learning_rate": 8.808749154369376e-07, + "loss": 0.0405, + "reward": 1.1495536267757416, + "reward_std": 0.20814339816570282, + "rewards/accuracy_reward": 0.17410715110599995, + "rewards/format_reward": 0.9754464775323868, + "step": 1049 + }, + { + "completion_length": 1137.966552734375, + "epoch": 0.31364349189754315, + "grad_norm": 0.5219810605049133, + "kl": 0.253662109375, + "learning_rate": 8.805566182409945e-07, + "loss": 0.0414, + "reward": 1.1183036267757416, + "reward_std": 0.17048955522477627, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9776786267757416, + "step": 1050 + }, + { + "completion_length": 1148.0067443847656, + "epoch": 0.3139421999850646, + "grad_norm": 0.5076432228088379, + "kl": 0.235595703125, + "learning_rate": 8.802379614318486e-07, + "loss": 0.0249, + "reward": 1.1741071939468384, + "reward_std": 0.25768617913126945, + "rewards/accuracy_reward": 0.2209821566939354, + "rewards/format_reward": 0.9531250447034836, + "step": 1051 + }, + { + "completion_length": 1177.2299499511719, + "epoch": 0.3142409080725861, + "grad_norm": 1.2411575317382812, + "kl": 0.304931640625, + "learning_rate": 8.799189453561679e-07, + "loss": 0.04, + "reward": 1.1071429252624512, + "reward_std": 0.22162554040551186, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9687500298023224, + "step": 1052 + }, + { + "completion_length": 1171.9777221679688, + "epoch": 0.31453961616010756, + "grad_norm": 0.44202178716659546, + "kl": 0.205322265625, + "learning_rate": 8.795995703610097e-07, + "loss": 0.0144, + "reward": 1.254464328289032, + "reward_std": 0.20248723030090332, + "rewards/accuracy_reward": 0.2924107313156128, + "rewards/format_reward": 0.9620535969734192, + "step": 1053 + }, + { + "completion_length": 1289.5111999511719, + "epoch": 0.31483832424762903, + "grad_norm": 0.8114349842071533, + "kl": 0.29052734375, + "learning_rate": 8.792798367938234e-07, + "loss": 0.0246, + "reward": 1.0781250447034836, + "reward_std": 0.2354605495929718, + "rewards/accuracy_reward": 0.12946428963914514, + "rewards/format_reward": 0.9486607611179352, + "step": 1054 + }, + { + "completion_length": 1194.7589721679688, + "epoch": 0.31513703233515045, + "grad_norm": 0.7534329891204834, + "kl": 0.19775390625, + "learning_rate": 8.789597450024478e-07, + "loss": 0.006, + "reward": 1.10714291036129, + "reward_std": 0.2686406075954437, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 1055 + }, + { + "completion_length": 1233.3527221679688, + "epoch": 0.3154357404226719, + "grad_norm": 1.3216562271118164, + "kl": 0.30517578125, + "learning_rate": 8.786392953351109e-07, + "loss": 0.0273, + "reward": 1.1227678954601288, + "reward_std": 0.27944479137659073, + "rewards/accuracy_reward": 0.16294643469154835, + "rewards/format_reward": 0.9598214626312256, + "step": 1056 + }, + { + "completion_length": 1158.7255249023438, + "epoch": 0.3157344485101934, + "grad_norm": 0.37715375423431396, + "kl": 0.34375, + "learning_rate": 8.783184881404307e-07, + "loss": 0.0392, + "reward": 1.0714286416769028, + "reward_std": 0.195925734937191, + "rewards/accuracy_reward": 0.09821429057046771, + "rewards/format_reward": 0.973214328289032, + "step": 1057 + }, + { + "completion_length": 1124.4308776855469, + "epoch": 0.31603315659771486, + "grad_norm": 0.6918757557868958, + "kl": 0.35205078125, + "learning_rate": 8.779973237674141e-07, + "loss": -0.0025, + "reward": 1.04464291036129, + "reward_std": 0.23221349716186523, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 1058 + }, + { + "completion_length": 1131.8036499023438, + "epoch": 0.31633186468523633, + "grad_norm": 0.5812383890151978, + "kl": 0.324462890625, + "learning_rate": 8.776758025654566e-07, + "loss": 0.0121, + "reward": 1.1227679252624512, + "reward_std": 0.19593237154185772, + "rewards/accuracy_reward": 0.16517857555299997, + "rewards/format_reward": 0.957589328289032, + "step": 1059 + }, + { + "completion_length": 1148.4107666015625, + "epoch": 0.3166305727727578, + "grad_norm": 0.6157229542732239, + "kl": 0.277587890625, + "learning_rate": 8.773539248843416e-07, + "loss": 0.018, + "reward": 1.0580357611179352, + "reward_std": 0.2206891067326069, + "rewards/accuracy_reward": 0.09151786100119352, + "rewards/format_reward": 0.96651791036129, + "step": 1060 + }, + { + "completion_length": 1067.7768249511719, + "epoch": 0.31692928086027927, + "grad_norm": 0.6751936674118042, + "kl": 0.29736328125, + "learning_rate": 8.770316910742403e-07, + "loss": -0.0174, + "reward": 1.1785714626312256, + "reward_std": 0.19342003390192986, + "rewards/accuracy_reward": 0.191964291036129, + "rewards/format_reward": 0.9866071790456772, + "step": 1061 + }, + { + "completion_length": 1134.5134582519531, + "epoch": 0.31722798894780074, + "grad_norm": 0.49552395939826965, + "kl": 0.32763671875, + "learning_rate": 8.767091014857118e-07, + "loss": -0.0045, + "reward": 1.1227678954601288, + "reward_std": 0.20780370011925697, + "rewards/accuracy_reward": 0.14285715040750802, + "rewards/format_reward": 0.979910746216774, + "step": 1062 + }, + { + "completion_length": 1097.5558624267578, + "epoch": 0.3175266970353222, + "grad_norm": 0.5569441914558411, + "kl": 0.33056640625, + "learning_rate": 8.763861564697017e-07, + "loss": 0.0269, + "reward": 1.1183036118745804, + "reward_std": 0.15407376922667027, + "rewards/accuracy_reward": 0.14732143771834671, + "rewards/format_reward": 0.9709821790456772, + "step": 1063 + }, + { + "completion_length": 1080.5089569091797, + "epoch": 0.3178254051228437, + "grad_norm": 0.5801836252212524, + "kl": 0.34619140625, + "learning_rate": 8.760628563775426e-07, + "loss": 0.0176, + "reward": 1.1875000596046448, + "reward_std": 0.22271496430039406, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9843750447034836, + "step": 1064 + }, + { + "completion_length": 1159.7121276855469, + "epoch": 0.31812411321036516, + "grad_norm": 0.9570164680480957, + "kl": 0.2509765625, + "learning_rate": 8.757392015609536e-07, + "loss": 0.0392, + "reward": 1.142857164144516, + "reward_std": 0.2408793307840824, + "rewards/accuracy_reward": 0.17410714738070965, + "rewards/format_reward": 0.9687500298023224, + "step": 1065 + }, + { + "completion_length": 1054.0781707763672, + "epoch": 0.3184228212978866, + "grad_norm": 0.5561630725860596, + "kl": 0.286865234375, + "learning_rate": 8.754151923720389e-07, + "loss": 0.0176, + "reward": 1.158482164144516, + "reward_std": 0.2538502924144268, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9776786118745804, + "step": 1066 + }, + { + "completion_length": 1089.6741638183594, + "epoch": 0.3187215293854081, + "grad_norm": 0.41451767086982727, + "kl": 0.3798828125, + "learning_rate": 8.750908291632893e-07, + "loss": 0.0269, + "reward": 1.116071492433548, + "reward_std": 0.15279990248382092, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9776786267757416, + "step": 1067 + }, + { + "completion_length": 1098.3549499511719, + "epoch": 0.31902023747292957, + "grad_norm": 0.4423317015171051, + "kl": 0.300537109375, + "learning_rate": 8.747661122875796e-07, + "loss": 0.0225, + "reward": 1.160714328289032, + "reward_std": 0.1732000894844532, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9754464626312256, + "step": 1068 + }, + { + "completion_length": 1047.450927734375, + "epoch": 0.31931894556045104, + "grad_norm": 0.5230085253715515, + "kl": 0.304443359375, + "learning_rate": 8.744410420981703e-07, + "loss": 0.0348, + "reward": 1.2477678954601288, + "reward_std": 0.2695121858268976, + "rewards/accuracy_reward": 0.2678571566939354, + "rewards/format_reward": 0.979910746216774, + "step": 1069 + }, + { + "completion_length": 1078.7210235595703, + "epoch": 0.3196176536479725, + "grad_norm": 0.5267232656478882, + "kl": 0.28369140625, + "learning_rate": 8.741156189487058e-07, + "loss": 0.0205, + "reward": 1.1674107909202576, + "reward_std": 0.181284686550498, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9709821790456772, + "step": 1070 + }, + { + "completion_length": 1056.0714874267578, + "epoch": 0.319916361735494, + "grad_norm": 0.6231043934822083, + "kl": 0.27490234375, + "learning_rate": 8.737898431932149e-07, + "loss": 0.0041, + "reward": 1.1875000596046448, + "reward_std": 0.15360806230455637, + "rewards/accuracy_reward": 0.2031250037252903, + "rewards/format_reward": 0.9843750596046448, + "step": 1071 + }, + { + "completion_length": 1063.4464874267578, + "epoch": 0.32021506982301545, + "grad_norm": 0.596026599407196, + "kl": 0.337890625, + "learning_rate": 8.734637151861093e-07, + "loss": 0.0203, + "reward": 1.1116072237491608, + "reward_std": 0.15510328486561775, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9843750596046448, + "step": 1072 + }, + { + "completion_length": 1070.7098693847656, + "epoch": 0.3205137779105369, + "grad_norm": 0.9417048692703247, + "kl": 0.295654296875, + "learning_rate": 8.731372352821843e-07, + "loss": 0.0596, + "reward": 1.1093750894069672, + "reward_std": 0.19707505032420158, + "rewards/accuracy_reward": 0.14508929662406445, + "rewards/format_reward": 0.964285746216774, + "step": 1073 + }, + { + "completion_length": 1093.6830749511719, + "epoch": 0.3208124859980584, + "grad_norm": 0.35108885169029236, + "kl": 0.257568359375, + "learning_rate": 8.728104038366182e-07, + "loss": 0.0152, + "reward": 1.129464328289032, + "reward_std": 0.18022939935326576, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9843750596046448, + "step": 1074 + }, + { + "completion_length": 1112.41748046875, + "epoch": 0.32111119408557987, + "grad_norm": 0.6268051266670227, + "kl": 0.35400390625, + "learning_rate": 8.724832212049716e-07, + "loss": 0.0035, + "reward": 1.162946492433548, + "reward_std": 0.19726943969726562, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.9754464626312256, + "step": 1075 + }, + { + "completion_length": 1147.8951416015625, + "epoch": 0.32140990217310134, + "grad_norm": 0.5676958560943604, + "kl": 0.345703125, + "learning_rate": 8.721556877431871e-07, + "loss": 0.0109, + "reward": 1.1049107611179352, + "reward_std": 0.256779495626688, + "rewards/accuracy_reward": 0.15625000838190317, + "rewards/format_reward": 0.948660746216774, + "step": 1076 + }, + { + "completion_length": 1099.2590026855469, + "epoch": 0.3217086102606228, + "grad_norm": 0.6145049929618835, + "kl": 0.406494140625, + "learning_rate": 8.718278038075891e-07, + "loss": 0.0793, + "reward": 1.0290178805589676, + "reward_std": 0.17831411957740784, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.9665178954601288, + "step": 1077 + }, + { + "completion_length": 1112.5402526855469, + "epoch": 0.3220073183481443, + "grad_norm": 0.6819204092025757, + "kl": 0.305908203125, + "learning_rate": 8.714995697548828e-07, + "loss": 0.0337, + "reward": 1.1986607611179352, + "reward_std": 0.26844048127532005, + "rewards/accuracy_reward": 0.23660715855658054, + "rewards/format_reward": 0.9620535969734192, + "step": 1078 + }, + { + "completion_length": 1116.8348693847656, + "epoch": 0.32230602643566575, + "grad_norm": 0.7898496985435486, + "kl": 0.3134765625, + "learning_rate": 8.711709859421551e-07, + "loss": 0.0065, + "reward": 1.2455357909202576, + "reward_std": 0.231604166328907, + "rewards/accuracy_reward": 0.2767857238650322, + "rewards/format_reward": 0.9687500447034836, + "step": 1079 + }, + { + "completion_length": 1065.558090209961, + "epoch": 0.3226047345231872, + "grad_norm": 0.8362342715263367, + "kl": 0.34228515625, + "learning_rate": 8.708420527268728e-07, + "loss": 0.0475, + "reward": 1.1696429252624512, + "reward_std": 0.2578626163303852, + "rewards/accuracy_reward": 0.21205358020961285, + "rewards/format_reward": 0.957589328289032, + "step": 1080 + }, + { + "completion_length": 1143.4732666015625, + "epoch": 0.3229034426107087, + "grad_norm": 0.5187474489212036, + "kl": 0.3232421875, + "learning_rate": 8.705127704668831e-07, + "loss": 0.0077, + "reward": 1.1026786267757416, + "reward_std": 0.17343294993042946, + "rewards/accuracy_reward": 0.12500000605359674, + "rewards/format_reward": 0.9776786118745804, + "step": 1081 + }, + { + "completion_length": 1115.4442291259766, + "epoch": 0.32320215069823016, + "grad_norm": 0.4269048869609833, + "kl": 0.3232421875, + "learning_rate": 8.701831395204127e-07, + "loss": 0.0145, + "reward": 1.2187500596046448, + "reward_std": 0.16921719256788492, + "rewards/accuracy_reward": 0.2410714365541935, + "rewards/format_reward": 0.9776785969734192, + "step": 1082 + }, + { + "completion_length": 1146.2054443359375, + "epoch": 0.32350085878575163, + "grad_norm": 0.7109887003898621, + "kl": 0.332763671875, + "learning_rate": 8.698531602460679e-07, + "loss": 0.0365, + "reward": 1.1183036267757416, + "reward_std": 0.2062375247478485, + "rewards/accuracy_reward": 0.1629464328289032, + "rewards/format_reward": 0.9553571790456772, + "step": 1083 + }, + { + "completion_length": 1086.6897430419922, + "epoch": 0.3237995668732731, + "grad_norm": 0.5959547758102417, + "kl": 0.515625, + "learning_rate": 8.695228330028336e-07, + "loss": 0.0286, + "reward": 1.1026786118745804, + "reward_std": 0.22786926478147507, + "rewards/accuracy_reward": 0.13839286752045155, + "rewards/format_reward": 0.9642857611179352, + "step": 1084 + }, + { + "completion_length": 1088.950942993164, + "epoch": 0.3240982749607946, + "grad_norm": 0.5634734034538269, + "kl": 0.3955078125, + "learning_rate": 8.691921581500735e-07, + "loss": 0.0243, + "reward": 1.0870536416769028, + "reward_std": 0.2319803684949875, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.9620536118745804, + "step": 1085 + }, + { + "completion_length": 993.1696929931641, + "epoch": 0.32439698304831605, + "grad_norm": 0.7903431057929993, + "kl": 0.4345703125, + "learning_rate": 8.688611360475298e-07, + "loss": 0.0392, + "reward": 1.1339285969734192, + "reward_std": 0.2671186998486519, + "rewards/accuracy_reward": 0.16741072107106447, + "rewards/format_reward": 0.9665178954601288, + "step": 1086 + }, + { + "completion_length": 1142.6741333007812, + "epoch": 0.3246956911358375, + "grad_norm": 0.6598278880119324, + "kl": 0.466796875, + "learning_rate": 8.685297670553217e-07, + "loss": 0.0215, + "reward": 1.1160714775323868, + "reward_std": 0.20647890120744705, + "rewards/accuracy_reward": 0.14955357578583062, + "rewards/format_reward": 0.9665178954601288, + "step": 1087 + }, + { + "completion_length": 1152.9308471679688, + "epoch": 0.324994399223359, + "grad_norm": 0.8383154273033142, + "kl": 0.380859375, + "learning_rate": 8.681980515339463e-07, + "loss": 0.0372, + "reward": 1.145089328289032, + "reward_std": 0.21987586095929146, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9754464626312256, + "step": 1088 + }, + { + "completion_length": 1058.2053985595703, + "epoch": 0.32529310731088046, + "grad_norm": 0.8379929661750793, + "kl": 0.4765625, + "learning_rate": 8.678659898442776e-07, + "loss": 0.0763, + "reward": 1.1696428954601288, + "reward_std": 0.23817084450274706, + "rewards/accuracy_reward": 0.21651786752045155, + "rewards/format_reward": 0.9531250596046448, + "step": 1089 + }, + { + "completion_length": 1197.9531555175781, + "epoch": 0.32559181539840193, + "grad_norm": 0.8302472233772278, + "kl": 0.44921875, + "learning_rate": 8.675335823475662e-07, + "loss": 0.0467, + "reward": 1.0691964626312256, + "reward_std": 0.21472956985235214, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.964285746216774, + "step": 1090 + }, + { + "completion_length": 1084.6027526855469, + "epoch": 0.3258905234859234, + "grad_norm": 0.9936146140098572, + "kl": 0.51953125, + "learning_rate": 8.67200829405439e-07, + "loss": 0.0196, + "reward": 1.095982164144516, + "reward_std": 0.2438303604722023, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9665178954601288, + "step": 1091 + }, + { + "completion_length": 1118.622802734375, + "epoch": 0.32618923157344487, + "grad_norm": 1.0651682615280151, + "kl": 0.5439453125, + "learning_rate": 8.668677313798981e-07, + "loss": 0.0413, + "reward": 1.1294643580913544, + "reward_std": 0.3065907843410969, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.9553571790456772, + "step": 1092 + }, + { + "completion_length": 1164.0848846435547, + "epoch": 0.32648793966096634, + "grad_norm": 1.006779432296753, + "kl": 0.478515625, + "learning_rate": 8.66534288633322e-07, + "loss": 0.0365, + "reward": 1.0267857611179352, + "reward_std": 0.2844923809170723, + "rewards/accuracy_reward": 0.07589285913854837, + "rewards/format_reward": 0.9508928954601288, + "step": 1093 + }, + { + "completion_length": 1075.2679138183594, + "epoch": 0.3267866477484878, + "grad_norm": 1.4392684698104858, + "kl": 0.48681640625, + "learning_rate": 8.662005015284637e-07, + "loss": 0.0449, + "reward": 1.1495536267757416, + "reward_std": 0.18903416395187378, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.9598214626312256, + "step": 1094 + }, + { + "completion_length": 1140.8772888183594, + "epoch": 0.3270853558360093, + "grad_norm": 0.5801920890808105, + "kl": 0.44775390625, + "learning_rate": 8.658663704284505e-07, + "loss": 0.036, + "reward": 1.1651786267757416, + "reward_std": 0.13746514357626438, + "rewards/accuracy_reward": 0.1718750037252903, + "rewards/format_reward": 0.9933035969734192, + "step": 1095 + }, + { + "completion_length": 1171.4308319091797, + "epoch": 0.32738406392353075, + "grad_norm": 0.8576582074165344, + "kl": 0.49853515625, + "learning_rate": 8.655318956967845e-07, + "loss": 0.019, + "reward": 1.2187500596046448, + "reward_std": 0.2119695171713829, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9709821790456772, + "step": 1096 + }, + { + "completion_length": 1136.21435546875, + "epoch": 0.3276827720110522, + "grad_norm": 0.9733317494392395, + "kl": 0.5302734375, + "learning_rate": 8.651970776973417e-07, + "loss": 0.0124, + "reward": 1.0937500596046448, + "reward_std": 0.19833021704107523, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9843750596046448, + "step": 1097 + }, + { + "completion_length": 1148.9665832519531, + "epoch": 0.32798148009857364, + "grad_norm": 1.0382323265075684, + "kl": 0.5107421875, + "learning_rate": 8.648619167943706e-07, + "loss": 0.0331, + "reward": 1.2120536267757416, + "reward_std": 0.1716722622513771, + "rewards/accuracy_reward": 0.2388393022119999, + "rewards/format_reward": 0.9732143133878708, + "step": 1098 + }, + { + "completion_length": 1140.2634582519531, + "epoch": 0.3282801881860951, + "grad_norm": 0.8290655612945557, + "kl": 0.47998046875, + "learning_rate": 8.645264133524942e-07, + "loss": 0.0149, + "reward": 1.098214328289032, + "reward_std": 0.22474259696900845, + "rewards/accuracy_reward": 0.13392857764847577, + "rewards/format_reward": 0.9642857611179352, + "step": 1099 + }, + { + "completion_length": 1211.0536193847656, + "epoch": 0.3285788962736166, + "grad_norm": 0.49321070313453674, + "kl": 0.41015625, + "learning_rate": 8.641905677367066e-07, + "loss": 0.0015, + "reward": 1.1540179252624512, + "reward_std": 0.180099256336689, + "rewards/accuracy_reward": 0.16964286286383867, + "rewards/format_reward": 0.9843750447034836, + "step": 1100 + }, + { + "completion_length": 1102.6161499023438, + "epoch": 0.32887760436113805, + "grad_norm": 0.9113833904266357, + "kl": 0.3720703125, + "learning_rate": 8.638543803123756e-07, + "loss": 0.0375, + "reward": 1.0915179252624512, + "reward_std": 0.16486643254756927, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.9709821790456772, + "step": 1101 + }, + { + "completion_length": 1215.77685546875, + "epoch": 0.3291763124486595, + "grad_norm": 0.5948262810707092, + "kl": 0.35009765625, + "learning_rate": 8.635178514452397e-07, + "loss": 0.0236, + "reward": 1.0781250447034836, + "reward_std": 0.15818801894783974, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.9754464775323868, + "step": 1102 + }, + { + "completion_length": 1194.5357666015625, + "epoch": 0.329475020536181, + "grad_norm": 0.6529059410095215, + "kl": 0.3134765625, + "learning_rate": 8.631809815014095e-07, + "loss": 0.0217, + "reward": 1.0647322088479996, + "reward_std": 0.1881876289844513, + "rewards/accuracy_reward": 0.10491072060540318, + "rewards/format_reward": 0.9598214775323868, + "step": 1103 + }, + { + "completion_length": 1141.0870666503906, + "epoch": 0.32977372862370247, + "grad_norm": 1.197441816329956, + "kl": 0.33203125, + "learning_rate": 8.628437708473664e-07, + "loss": 0.0598, + "reward": 1.069196492433548, + "reward_std": 0.1891921516507864, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.9687500447034836, + "step": 1104 + }, + { + "completion_length": 1269.7634582519531, + "epoch": 0.33007243671122394, + "grad_norm": 1.1162018775939941, + "kl": 0.322265625, + "learning_rate": 8.625062198499627e-07, + "loss": 0.0345, + "reward": 1.1406250298023224, + "reward_std": 0.22402751818299294, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.96651791036129, + "step": 1105 + }, + { + "completion_length": 1209.21435546875, + "epoch": 0.3303711447987454, + "grad_norm": 0.7275714874267578, + "kl": 0.32177734375, + "learning_rate": 8.621683288764207e-07, + "loss": 0.0396, + "reward": 1.147321492433548, + "reward_std": 0.18005516566336155, + "rewards/accuracy_reward": 0.17633929662406445, + "rewards/format_reward": 0.9709821939468384, + "step": 1106 + }, + { + "completion_length": 1128.1652526855469, + "epoch": 0.3306698528862669, + "grad_norm": 0.5465099215507507, + "kl": 0.266357421875, + "learning_rate": 8.618300982943327e-07, + "loss": 0.008, + "reward": 1.2366071939468384, + "reward_std": 0.18844716250896454, + "rewards/accuracy_reward": 0.25669644214212894, + "rewards/format_reward": 0.9799107611179352, + "step": 1107 + }, + { + "completion_length": 1092.0402221679688, + "epoch": 0.33096856097378835, + "grad_norm": 0.5767174959182739, + "kl": 0.319580078125, + "learning_rate": 8.614915284716603e-07, + "loss": 0.0263, + "reward": 1.1495536267757416, + "reward_std": 0.15921075735241175, + "rewards/accuracy_reward": 0.17187500931322575, + "rewards/format_reward": 0.9776786118745804, + "step": 1108 + }, + { + "completion_length": 1055.2076263427734, + "epoch": 0.3312672690613098, + "grad_norm": 0.6815938353538513, + "kl": 0.28662109375, + "learning_rate": 8.611526197767346e-07, + "loss": 0.0124, + "reward": 1.1964286267757416, + "reward_std": 0.15356609597802162, + "rewards/accuracy_reward": 0.20758928917348385, + "rewards/format_reward": 0.9888393133878708, + "step": 1109 + }, + { + "completion_length": 1058.841552734375, + "epoch": 0.3315659771488313, + "grad_norm": 0.653323233127594, + "kl": 0.44091796875, + "learning_rate": 8.608133725782545e-07, + "loss": 0.0574, + "reward": 1.0468750596046448, + "reward_std": 0.28262215107679367, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.941964328289032, + "step": 1110 + }, + { + "completion_length": 1162.2991943359375, + "epoch": 0.33186468523635276, + "grad_norm": 0.8391473889350891, + "kl": 0.470703125, + "learning_rate": 8.604737872452881e-07, + "loss": 0.0267, + "reward": 1.1495535969734192, + "reward_std": 0.17156894505023956, + "rewards/accuracy_reward": 0.18750000605359674, + "rewards/format_reward": 0.9620536118745804, + "step": 1111 + }, + { + "completion_length": 1068.1853485107422, + "epoch": 0.33216339332387423, + "grad_norm": 1.1230158805847168, + "kl": 0.4150390625, + "learning_rate": 8.601338641472709e-07, + "loss": 0.0336, + "reward": 1.2142857909202576, + "reward_std": 0.27570317313075066, + "rewards/accuracy_reward": 0.2589285895228386, + "rewards/format_reward": 0.9553571790456772, + "step": 1112 + }, + { + "completion_length": 1109.1004943847656, + "epoch": 0.3324621014113957, + "grad_norm": 0.5754477977752686, + "kl": 0.3720703125, + "learning_rate": 8.597936036540061e-07, + "loss": 0.0254, + "reward": 1.0736607611179352, + "reward_std": 0.1653910744935274, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.9709821790456772, + "step": 1113 + }, + { + "completion_length": 1107.6473846435547, + "epoch": 0.3327608094989172, + "grad_norm": 1.0289397239685059, + "kl": 0.30126953125, + "learning_rate": 8.594530061356633e-07, + "loss": 0.0392, + "reward": 1.0424107909202576, + "reward_std": 0.23306577280163765, + "rewards/accuracy_reward": 0.07812500279396772, + "rewards/format_reward": 0.964285746216774, + "step": 1114 + }, + { + "completion_length": 1109.6116333007812, + "epoch": 0.33305951758643865, + "grad_norm": 0.45993903279304504, + "kl": 0.3525390625, + "learning_rate": 8.591120719627796e-07, + "loss": 0.0323, + "reward": 1.145089328289032, + "reward_std": 0.2324003092944622, + "rewards/accuracy_reward": 0.17857143748551607, + "rewards/format_reward": 0.9665178954601288, + "step": 1115 + }, + { + "completion_length": 1068.2991485595703, + "epoch": 0.3333582256739601, + "grad_norm": 0.5991848111152649, + "kl": 0.283203125, + "learning_rate": 8.587708015062578e-07, + "loss": 0.0087, + "reward": 1.2924107611179352, + "reward_std": 0.2458600103855133, + "rewards/accuracy_reward": 0.3325892984867096, + "rewards/format_reward": 0.9598214626312256, + "step": 1116 + }, + { + "completion_length": 1175.5469360351562, + "epoch": 0.3336569337614816, + "grad_norm": 0.9410957098007202, + "kl": 0.37353515625, + "learning_rate": 8.584291951373668e-07, + "loss": 0.0686, + "reward": 1.0558036267757416, + "reward_std": 0.27285540848970413, + "rewards/accuracy_reward": 0.10267857508733869, + "rewards/format_reward": 0.9531250447034836, + "step": 1117 + }, + { + "completion_length": 1067.8928985595703, + "epoch": 0.33395564184900306, + "grad_norm": 0.47335943579673767, + "kl": 0.341796875, + "learning_rate": 8.580872532277407e-07, + "loss": 0.0438, + "reward": 1.116071492433548, + "reward_std": 0.1756242709234357, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.973214328289032, + "step": 1118 + }, + { + "completion_length": 1084.3415832519531, + "epoch": 0.33425434993652453, + "grad_norm": 0.45412638783454895, + "kl": 0.2666015625, + "learning_rate": 8.57744976149379e-07, + "loss": 0.0021, + "reward": 1.06026791036129, + "reward_std": 0.15859811753034592, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.9709821790456772, + "step": 1119 + }, + { + "completion_length": 1175.2879943847656, + "epoch": 0.334553058024046, + "grad_norm": 0.8683106899261475, + "kl": 0.392578125, + "learning_rate": 8.574023642746455e-07, + "loss": 0.0702, + "reward": 1.0781250298023224, + "reward_std": 0.23303679376840591, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9553571790456772, + "step": 1120 + }, + { + "completion_length": 1098.497817993164, + "epoch": 0.3348517661115675, + "grad_norm": 0.8478466868400574, + "kl": 0.34912109375, + "learning_rate": 8.570594179762681e-07, + "loss": 0.0522, + "reward": 1.0535714775323868, + "reward_std": 0.22465689852833748, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.9531250298023224, + "step": 1121 + }, + { + "completion_length": 1058.620590209961, + "epoch": 0.33515047419908894, + "grad_norm": 0.5685875415802002, + "kl": 0.46435546875, + "learning_rate": 8.567161376273393e-07, + "loss": 0.0415, + "reward": 1.1004464626312256, + "reward_std": 0.21677861735224724, + "rewards/accuracy_reward": 0.13839286658912897, + "rewards/format_reward": 0.9620536118745804, + "step": 1122 + }, + { + "completion_length": 1006.3058471679688, + "epoch": 0.3354491822866104, + "grad_norm": 0.8609712719917297, + "kl": 0.44482421875, + "learning_rate": 8.563725236013139e-07, + "loss": 0.0571, + "reward": 1.1406250596046448, + "reward_std": 0.2288895584642887, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9776786267757416, + "step": 1123 + }, + { + "completion_length": 1105.4643249511719, + "epoch": 0.3357478903741319, + "grad_norm": 0.838762640953064, + "kl": 0.60205078125, + "learning_rate": 8.560285762720109e-07, + "loss": 0.0732, + "reward": 1.064732164144516, + "reward_std": 0.19506799429655075, + "rewards/accuracy_reward": 0.10937500186264515, + "rewards/format_reward": 0.9553571790456772, + "step": 1124 + }, + { + "completion_length": 1129.0067443847656, + "epoch": 0.33604659846165336, + "grad_norm": 0.6733607649803162, + "kl": 0.54150390625, + "learning_rate": 8.556842960136107e-07, + "loss": 0.0333, + "reward": 1.0848214477300644, + "reward_std": 0.19275897182524204, + "rewards/accuracy_reward": 0.12276786542497575, + "rewards/format_reward": 0.9620536267757416, + "step": 1125 + }, + { + "completion_length": 1128.7188110351562, + "epoch": 0.3363453065491748, + "grad_norm": 0.9934326410293579, + "kl": 0.544921875, + "learning_rate": 8.553396832006568e-07, + "loss": 0.0588, + "reward": 1.209821492433548, + "reward_std": 0.3195957541465759, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.9553571790456772, + "step": 1126 + }, + { + "completion_length": 1210.9420166015625, + "epoch": 0.3366440146366963, + "grad_norm": 0.6955965757369995, + "kl": 0.5966796875, + "learning_rate": 8.54994738208054e-07, + "loss": 0.0356, + "reward": 1.0312500596046448, + "reward_std": 0.19240542501211166, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.9687500447034836, + "step": 1127 + }, + { + "completion_length": 1099.3728485107422, + "epoch": 0.33694272272421777, + "grad_norm": 0.6351243853569031, + "kl": 0.38818359375, + "learning_rate": 8.546494614110688e-07, + "loss": 0.0322, + "reward": 1.0580357611179352, + "reward_std": 0.1527905222028494, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 1128 + }, + { + "completion_length": 1166.5558776855469, + "epoch": 0.33724143081173924, + "grad_norm": 0.5948604941368103, + "kl": 0.5546875, + "learning_rate": 8.543038531853285e-07, + "loss": 0.0608, + "reward": 1.1227679252624512, + "reward_std": 0.1644816007465124, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9709821790456772, + "step": 1129 + }, + { + "completion_length": 1124.6205749511719, + "epoch": 0.3375401388992607, + "grad_norm": 1.205372929573059, + "kl": 0.453125, + "learning_rate": 8.539579139068207e-07, + "loss": 0.0497, + "reward": 1.1830357909202576, + "reward_std": 0.29408638924360275, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9642857611179352, + "step": 1130 + }, + { + "completion_length": 1155.4777221679688, + "epoch": 0.3378388469867822, + "grad_norm": 0.8451985120773315, + "kl": 0.44189453125, + "learning_rate": 8.536116439518938e-07, + "loss": 0.0556, + "reward": 1.07589291036129, + "reward_std": 0.22808865830302238, + "rewards/accuracy_reward": 0.12053572339937091, + "rewards/format_reward": 0.9553571790456772, + "step": 1131 + }, + { + "completion_length": 1063.4397735595703, + "epoch": 0.33813755507430365, + "grad_norm": 0.4778437912464142, + "kl": 0.36669921875, + "learning_rate": 8.532650436972555e-07, + "loss": 0.034, + "reward": 1.1049107611179352, + "reward_std": 0.18759458139538765, + "rewards/accuracy_reward": 0.12500000419095159, + "rewards/format_reward": 0.979910746216774, + "step": 1132 + }, + { + "completion_length": 1230.7857666015625, + "epoch": 0.3384362631618251, + "grad_norm": 1.1331524848937988, + "kl": 0.41796875, + "learning_rate": 8.529181135199726e-07, + "loss": 0.0866, + "reward": 1.1830357611179352, + "reward_std": 0.2579590007662773, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9620536118745804, + "step": 1133 + }, + { + "completion_length": 1189.6228332519531, + "epoch": 0.3387349712493466, + "grad_norm": 0.7806586623191833, + "kl": 0.50634765625, + "learning_rate": 8.525708537974715e-07, + "loss": 0.0489, + "reward": 1.0491072237491608, + "reward_std": 0.23005883395671844, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.9642857611179352, + "step": 1134 + }, + { + "completion_length": 1076.4531860351562, + "epoch": 0.33903367933686807, + "grad_norm": 0.8146312236785889, + "kl": 0.29736328125, + "learning_rate": 8.522232649075366e-07, + "loss": 0.0354, + "reward": 1.1517857909202576, + "reward_std": 0.21327360905706882, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9754464775323868, + "step": 1135 + }, + { + "completion_length": 1059.776840209961, + "epoch": 0.33933238742438954, + "grad_norm": 0.8246979117393494, + "kl": 0.376953125, + "learning_rate": 8.518753472283105e-07, + "loss": 0.0625, + "reward": 1.1383928954601288, + "reward_std": 0.23596473596990108, + "rewards/accuracy_reward": 0.16517858067527413, + "rewards/format_reward": 0.973214328289032, + "step": 1136 + }, + { + "completion_length": 1071.450942993164, + "epoch": 0.339631095511911, + "grad_norm": 0.7693976759910583, + "kl": 0.42822265625, + "learning_rate": 8.515271011382937e-07, + "loss": 0.0323, + "reward": 1.113839328289032, + "reward_std": 0.1802177932113409, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.9732143133878708, + "step": 1137 + }, + { + "completion_length": 1131.745590209961, + "epoch": 0.3399298035994325, + "grad_norm": 0.49670061469078064, + "kl": 0.35595703125, + "learning_rate": 8.511785270163436e-07, + "loss": 0.0441, + "reward": 1.082589328289032, + "reward_std": 0.1697205901145935, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9754464477300644, + "step": 1138 + }, + { + "completion_length": 1073.3795013427734, + "epoch": 0.34022851168695395, + "grad_norm": 3.214099407196045, + "kl": 0.4306640625, + "learning_rate": 8.508296252416748e-07, + "loss": 0.0179, + "reward": 1.1517857313156128, + "reward_std": 0.13186875730752945, + "rewards/accuracy_reward": 0.15848215040750802, + "rewards/format_reward": 0.9933035969734192, + "step": 1139 + }, + { + "completion_length": 1091.2790985107422, + "epoch": 0.3405272197744754, + "grad_norm": 0.7277560830116272, + "kl": 0.4765625, + "learning_rate": 8.504803961938582e-07, + "loss": 0.0171, + "reward": 1.082589328289032, + "reward_std": 0.20670123398303986, + "rewards/accuracy_reward": 0.12276786053553224, + "rewards/format_reward": 0.9598214626312256, + "step": 1140 + }, + { + "completion_length": 1156.65185546875, + "epoch": 0.3408259278619969, + "grad_norm": 0.8529951572418213, + "kl": 0.52294921875, + "learning_rate": 8.501308402528207e-07, + "loss": 0.0586, + "reward": 1.1986607611179352, + "reward_std": 0.3120819628238678, + "rewards/accuracy_reward": 0.2656250111758709, + "rewards/format_reward": 0.933035746216774, + "step": 1141 + }, + { + "completion_length": 1086.3326416015625, + "epoch": 0.3411246359495183, + "grad_norm": 1.0592271089553833, + "kl": 0.34375, + "learning_rate": 8.497809577988451e-07, + "loss": 0.0234, + "reward": 1.0156250298023224, + "reward_std": 0.12827018462121487, + "rewards/accuracy_reward": 0.037946428870782256, + "rewards/format_reward": 0.9776786118745804, + "step": 1142 + }, + { + "completion_length": 983.1339569091797, + "epoch": 0.3414233440370398, + "grad_norm": 0.6102848649024963, + "kl": 0.338623046875, + "learning_rate": 8.494307492125691e-07, + "loss": 0.0364, + "reward": 1.1272321939468384, + "reward_std": 0.1531907059252262, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9843750596046448, + "step": 1143 + }, + { + "completion_length": 961.6094055175781, + "epoch": 0.34172205212456125, + "grad_norm": 0.8994734287261963, + "kl": 0.3017578125, + "learning_rate": 8.490802148749853e-07, + "loss": 0.0752, + "reward": 1.2209822237491608, + "reward_std": 0.2661811225116253, + "rewards/accuracy_reward": 0.243303582072258, + "rewards/format_reward": 0.9776785969734192, + "step": 1144 + }, + { + "completion_length": 1085.1585388183594, + "epoch": 0.3420207602120827, + "grad_norm": 1.0906902551651, + "kl": 0.396484375, + "learning_rate": 8.487293551674406e-07, + "loss": 0.0655, + "reward": 1.1495536267757416, + "reward_std": 0.24095037020742893, + "rewards/accuracy_reward": 0.19866072479635477, + "rewards/format_reward": 0.9508928954601288, + "step": 1145 + }, + { + "completion_length": 1068.8973693847656, + "epoch": 0.3423194682996042, + "grad_norm": 0.4961259365081787, + "kl": 0.33984375, + "learning_rate": 8.483781704716363e-07, + "loss": 0.0303, + "reward": 1.0334822088479996, + "reward_std": 0.12034640926867723, + "rewards/accuracy_reward": 0.05803571850992739, + "rewards/format_reward": 0.9754464477300644, + "step": 1146 + }, + { + "completion_length": 1019.9464874267578, + "epoch": 0.34261817638712566, + "grad_norm": 0.75471431016922, + "kl": 0.326171875, + "learning_rate": 8.480266611696266e-07, + "loss": 0.0021, + "reward": 1.1875000596046448, + "reward_std": 0.18875562772154808, + "rewards/accuracy_reward": 0.21428573317825794, + "rewards/format_reward": 0.973214328289032, + "step": 1147 + }, + { + "completion_length": 1110.9197082519531, + "epoch": 0.34291688447464713, + "grad_norm": 0.7604905366897583, + "kl": 0.36572265625, + "learning_rate": 8.476748276438194e-07, + "loss": 0.0606, + "reward": 1.2946429252624512, + "reward_std": 0.2672804296016693, + "rewards/accuracy_reward": 0.3191964440047741, + "rewards/format_reward": 0.9754464477300644, + "step": 1148 + }, + { + "completion_length": 1081.9152221679688, + "epoch": 0.3432155925621686, + "grad_norm": 0.7162444591522217, + "kl": 0.4296875, + "learning_rate": 8.473226702769749e-07, + "loss": 0.0419, + "reward": 1.147321492433548, + "reward_std": 0.233627337962389, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.9709821790456772, + "step": 1149 + }, + { + "completion_length": 1060.8304138183594, + "epoch": 0.3435143006496901, + "grad_norm": 0.7864062786102295, + "kl": 0.293701171875, + "learning_rate": 8.46970189452206e-07, + "loss": 0.0514, + "reward": 1.1584821939468384, + "reward_std": 0.23201031424105167, + "rewards/accuracy_reward": 0.18973214668221772, + "rewards/format_reward": 0.9687500447034836, + "step": 1150 + }, + { + "completion_length": 1068.0245971679688, + "epoch": 0.34381300873721155, + "grad_norm": 0.6454563140869141, + "kl": 0.38623046875, + "learning_rate": 8.46617385552977e-07, + "loss": 0.0538, + "reward": 1.0691964775323868, + "reward_std": 0.19419069774448872, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.957589328289032, + "step": 1151 + }, + { + "completion_length": 1132.5201416015625, + "epoch": 0.344111716824733, + "grad_norm": 0.7339260578155518, + "kl": 0.43603515625, + "learning_rate": 8.462642589631044e-07, + "loss": 0.0535, + "reward": 1.0468750596046448, + "reward_std": 0.22927706316113472, + "rewards/accuracy_reward": 0.09821428777649999, + "rewards/format_reward": 0.9486607611179352, + "step": 1152 + }, + { + "completion_length": 1059.0268249511719, + "epoch": 0.3444104249122545, + "grad_norm": 0.7450567483901978, + "kl": 0.4931640625, + "learning_rate": 8.459108100667548e-07, + "loss": 0.0678, + "reward": 1.1718750447034836, + "reward_std": 0.23394886776804924, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 1153 + }, + { + "completion_length": 1008.1585235595703, + "epoch": 0.34470913299977596, + "grad_norm": 0.9061623811721802, + "kl": 0.3310546875, + "learning_rate": 8.455570392484464e-07, + "loss": 0.0426, + "reward": 1.207589328289032, + "reward_std": 0.19008641503751278, + "rewards/accuracy_reward": 0.23883929569274187, + "rewards/format_reward": 0.9687500596046448, + "step": 1154 + }, + { + "completion_length": 984.6228179931641, + "epoch": 0.34500784108729743, + "grad_norm": 0.6010558605194092, + "kl": 0.375, + "learning_rate": 8.45202946893047e-07, + "loss": 0.0353, + "reward": 1.1339286267757416, + "reward_std": 0.23246139660477638, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9620536118745804, + "step": 1155 + }, + { + "completion_length": 1105.2500762939453, + "epoch": 0.3453065491748189, + "grad_norm": 0.6614886522293091, + "kl": 0.421875, + "learning_rate": 8.448485333857742e-07, + "loss": 0.0615, + "reward": 1.1428571939468384, + "reward_std": 0.17714253813028336, + "rewards/accuracy_reward": 0.1696428705472499, + "rewards/format_reward": 0.9732143431901932, + "step": 1156 + }, + { + "completion_length": 1091.5669860839844, + "epoch": 0.34560525726234037, + "grad_norm": 0.7097234129905701, + "kl": 0.4931640625, + "learning_rate": 8.444937991121956e-07, + "loss": 0.0463, + "reward": 1.0959821939468384, + "reward_std": 0.21734417416155338, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.9754464775323868, + "step": 1157 + }, + { + "completion_length": 1049.2210388183594, + "epoch": 0.34590396534986184, + "grad_norm": 0.7650419473648071, + "kl": 0.47900390625, + "learning_rate": 8.44138744458227e-07, + "loss": 0.0699, + "reward": 1.238839328289032, + "reward_std": 0.22358405962586403, + "rewards/accuracy_reward": 0.2656250149011612, + "rewards/format_reward": 0.9732143133878708, + "step": 1158 + }, + { + "completion_length": 1005.2455902099609, + "epoch": 0.3462026734373833, + "grad_norm": 1.0526769161224365, + "kl": 0.548828125, + "learning_rate": 8.437833698101331e-07, + "loss": 0.0645, + "reward": 1.0803571939468384, + "reward_std": 0.2443988285958767, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.9375000298023224, + "step": 1159 + }, + { + "completion_length": 1048.7143096923828, + "epoch": 0.3465013815249048, + "grad_norm": 0.6835527420043945, + "kl": 0.3779296875, + "learning_rate": 8.434276755545265e-07, + "loss": 0.0251, + "reward": 1.104910746216774, + "reward_std": 0.09521150682121515, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.9799107313156128, + "step": 1160 + }, + { + "completion_length": 1093.1830749511719, + "epoch": 0.34680008961242625, + "grad_norm": 0.96452796459198, + "kl": 0.513671875, + "learning_rate": 8.430716620783683e-07, + "loss": 0.0551, + "reward": 1.1227679252624512, + "reward_std": 0.24482593312859535, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.96651791036129, + "step": 1161 + }, + { + "completion_length": 1024.7076416015625, + "epoch": 0.3470987976999477, + "grad_norm": 0.6855654716491699, + "kl": 0.39111328125, + "learning_rate": 8.427153297689654e-07, + "loss": 0.0312, + "reward": 1.1875000596046448, + "reward_std": 0.19138794764876366, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.9732143431901932, + "step": 1162 + }, + { + "completion_length": 1072.075927734375, + "epoch": 0.3473975057874692, + "grad_norm": 0.6116604804992676, + "kl": 0.40966796875, + "learning_rate": 8.423586790139733e-07, + "loss": 0.0221, + "reward": 1.0892857611179352, + "reward_std": 0.18036843463778496, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9709821790456772, + "step": 1163 + }, + { + "completion_length": 1057.9866485595703, + "epoch": 0.34769621387499067, + "grad_norm": 0.5071328282356262, + "kl": 0.3779296875, + "learning_rate": 8.420017102013924e-07, + "loss": 0.0521, + "reward": 1.051339328289032, + "reward_std": 0.18219483457505703, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.9642857611179352, + "step": 1164 + }, + { + "completion_length": 1022.4241638183594, + "epoch": 0.34799492196251214, + "grad_norm": 0.8048277497291565, + "kl": 0.45166015625, + "learning_rate": 8.416444237195701e-07, + "loss": 0.049, + "reward": 1.1026786416769028, + "reward_std": 0.21175462752580643, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.9665178805589676, + "step": 1165 + }, + { + "completion_length": 960.3906707763672, + "epoch": 0.3482936300500336, + "grad_norm": 0.6260908842086792, + "kl": 0.2666015625, + "learning_rate": 8.41286819957199e-07, + "loss": 0.051, + "reward": 1.285714328289032, + "reward_std": 0.19554875791072845, + "rewards/accuracy_reward": 0.3035714365541935, + "rewards/format_reward": 0.9821428954601288, + "step": 1166 + }, + { + "completion_length": 1090.9152374267578, + "epoch": 0.3485923381375551, + "grad_norm": 0.7859447002410889, + "kl": 0.439453125, + "learning_rate": 8.409288993033171e-07, + "loss": 0.0498, + "reward": 1.1383929252624512, + "reward_std": 0.17350593581795692, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.9754464626312256, + "step": 1167 + }, + { + "completion_length": 1081.5603332519531, + "epoch": 0.34889104622507655, + "grad_norm": 0.49111318588256836, + "kl": 0.276123046875, + "learning_rate": 8.405706621473069e-07, + "loss": 0.0274, + "reward": 1.0602678954601288, + "reward_std": 0.17505836859345436, + "rewards/accuracy_reward": 0.07812500605359674, + "rewards/format_reward": 0.9821428805589676, + "step": 1168 + }, + { + "completion_length": 1035.7746124267578, + "epoch": 0.349189754312598, + "grad_norm": 0.7358346581459045, + "kl": 0.343505859375, + "learning_rate": 8.40212108878895e-07, + "loss": 0.0442, + "reward": 1.0758928954601288, + "reward_std": 0.22469143383204937, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9620536118745804, + "step": 1169 + }, + { + "completion_length": 985.1004943847656, + "epoch": 0.3494884624001195, + "grad_norm": 0.866433322429657, + "kl": 0.296630859375, + "learning_rate": 8.398532398881527e-07, + "loss": 0.048, + "reward": 1.1875000298023224, + "reward_std": 0.18057307228446007, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.9843750447034836, + "step": 1170 + }, + { + "completion_length": 1060.4174499511719, + "epoch": 0.34978717048764096, + "grad_norm": 0.6081607937812805, + "kl": 0.368896484375, + "learning_rate": 8.39494055565494e-07, + "loss": 0.0389, + "reward": 1.1294643580913544, + "reward_std": 0.17213633097708225, + "rewards/accuracy_reward": 0.14508928963914514, + "rewards/format_reward": 0.9843750447034836, + "step": 1171 + }, + { + "completion_length": 1088.3839569091797, + "epoch": 0.35008587857516243, + "grad_norm": 0.7363765835762024, + "kl": 0.40576171875, + "learning_rate": 8.391345563016763e-07, + "loss": 0.0612, + "reward": 1.1897321939468384, + "reward_std": 0.2139531709253788, + "rewards/accuracy_reward": 0.21205357927829027, + "rewards/format_reward": 0.9776786267757416, + "step": 1172 + }, + { + "completion_length": 1102.8504943847656, + "epoch": 0.3503845866626839, + "grad_norm": 0.8428829908370972, + "kl": 0.42626953125, + "learning_rate": 8.387747424877996e-07, + "loss": 0.0702, + "reward": 1.0959822088479996, + "reward_std": 0.1536906287074089, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9754464626312256, + "step": 1173 + }, + { + "completion_length": 1041.0245971679688, + "epoch": 0.3506832947502054, + "grad_norm": 0.5687955617904663, + "kl": 0.45458984375, + "learning_rate": 8.384146145153059e-07, + "loss": 0.0741, + "reward": 1.136160746216774, + "reward_std": 0.24545044265687466, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 1174 + }, + { + "completion_length": 1047.1317291259766, + "epoch": 0.35098200283772685, + "grad_norm": 1.7673550844192505, + "kl": 0.52685546875, + "learning_rate": 8.380541727759794e-07, + "loss": 0.0623, + "reward": 1.2232143580913544, + "reward_std": 0.19512634724378586, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9754464775323868, + "step": 1175 + }, + { + "completion_length": 1060.5513916015625, + "epoch": 0.3512807109252483, + "grad_norm": 1.150676965713501, + "kl": 0.62158203125, + "learning_rate": 8.376934176619454e-07, + "loss": 0.0477, + "reward": 1.1116071790456772, + "reward_std": 0.22557036951184273, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9665178954601288, + "step": 1176 + }, + { + "completion_length": 1037.8817596435547, + "epoch": 0.3515794190127698, + "grad_norm": 0.8812790513038635, + "kl": 0.52392578125, + "learning_rate": 8.373323495656699e-07, + "loss": 0.0791, + "reward": 1.116071492433548, + "reward_std": 0.23525480553507805, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9531250447034836, + "step": 1177 + }, + { + "completion_length": 960.5714721679688, + "epoch": 0.35187812710029126, + "grad_norm": 0.9912075996398926, + "kl": 0.455078125, + "learning_rate": 8.369709688799596e-07, + "loss": 0.0351, + "reward": 1.2008928954601288, + "reward_std": 0.19733618572354317, + "rewards/accuracy_reward": 0.2276785783469677, + "rewards/format_reward": 0.973214328289032, + "step": 1178 + }, + { + "completion_length": 1150.5692749023438, + "epoch": 0.35217683518781273, + "grad_norm": 1.04091215133667, + "kl": 0.5595703125, + "learning_rate": 8.366092759979612e-07, + "loss": 0.0587, + "reward": 1.071428656578064, + "reward_std": 0.20117651671171188, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.9620536118745804, + "step": 1179 + }, + { + "completion_length": 1029.8348693847656, + "epoch": 0.3524755432753342, + "grad_norm": 0.612055242061615, + "kl": 0.517578125, + "learning_rate": 8.362472713131614e-07, + "loss": 0.0448, + "reward": 1.176339328289032, + "reward_std": 0.12875500600785017, + "rewards/accuracy_reward": 0.19866072502918541, + "rewards/format_reward": 0.9776786118745804, + "step": 1180 + }, + { + "completion_length": 1011.8705902099609, + "epoch": 0.3527742513628557, + "grad_norm": 0.8100084662437439, + "kl": 0.56005859375, + "learning_rate": 8.358849552193857e-07, + "loss": 0.1023, + "reward": 1.207589328289032, + "reward_std": 0.26437849551439285, + "rewards/accuracy_reward": 0.2343750149011612, + "rewards/format_reward": 0.973214328289032, + "step": 1181 + }, + { + "completion_length": 1007.7723693847656, + "epoch": 0.35307295945037714, + "grad_norm": 0.9279620051383972, + "kl": 0.5205078125, + "learning_rate": 8.355223281107985e-07, + "loss": 0.0321, + "reward": 1.1808035969734192, + "reward_std": 0.2218836471438408, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9687500298023224, + "step": 1182 + }, + { + "completion_length": 1028.5625610351562, + "epoch": 0.3533716675378986, + "grad_norm": 1.5612059831619263, + "kl": 0.43505859375, + "learning_rate": 8.351593903819022e-07, + "loss": 0.0318, + "reward": 1.0714286267757416, + "reward_std": 0.2440868876874447, + "rewards/accuracy_reward": 0.10491071920841932, + "rewards/format_reward": 0.9665178954601288, + "step": 1183 + }, + { + "completion_length": 1095.0915832519531, + "epoch": 0.3536703756254201, + "grad_norm": 0.6840655207633972, + "kl": 0.54150390625, + "learning_rate": 8.34796142427538e-07, + "loss": 0.053, + "reward": 1.1004464626312256, + "reward_std": 0.19210810586810112, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.96651791036129, + "step": 1184 + }, + { + "completion_length": 1095.1920318603516, + "epoch": 0.3539690837129415, + "grad_norm": 0.8756506443023682, + "kl": 0.5361328125, + "learning_rate": 8.344325846428839e-07, + "loss": 0.0636, + "reward": 0.9933036118745804, + "reward_std": 0.2092122808098793, + "rewards/accuracy_reward": 0.03348214365541935, + "rewards/format_reward": 0.9598214775323868, + "step": 1185 + }, + { + "completion_length": 1110.8750610351562, + "epoch": 0.35426779180046297, + "grad_norm": 1.3235447406768799, + "kl": 0.52734375, + "learning_rate": 8.340687174234551e-07, + "loss": 0.0644, + "reward": 1.058035746216774, + "reward_std": 0.2008441425859928, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.9575893133878708, + "step": 1186 + }, + { + "completion_length": 1081.6272735595703, + "epoch": 0.35456649988798444, + "grad_norm": 0.7303282022476196, + "kl": 0.4404296875, + "learning_rate": 8.337045411651034e-07, + "loss": 0.0253, + "reward": 1.1116071939468384, + "reward_std": 0.2295202985405922, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.9776786118745804, + "step": 1187 + }, + { + "completion_length": 1105.9732666015625, + "epoch": 0.3548652079755059, + "grad_norm": 1.1480505466461182, + "kl": 0.48046875, + "learning_rate": 8.333400562640172e-07, + "loss": 0.0792, + "reward": 1.084821492433548, + "reward_std": 0.22670630738139153, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.9687500447034836, + "step": 1188 + }, + { + "completion_length": 952.7745819091797, + "epoch": 0.3551639160630274, + "grad_norm": 0.9043588042259216, + "kl": 0.3623046875, + "learning_rate": 8.329752631167197e-07, + "loss": 0.0221, + "reward": 1.212053656578064, + "reward_std": 0.1913885734975338, + "rewards/accuracy_reward": 0.22767858766019344, + "rewards/format_reward": 0.9843750298023224, + "step": 1189 + }, + { + "completion_length": 962.7723541259766, + "epoch": 0.35546262415054886, + "grad_norm": 1.180556297302246, + "kl": 0.445556640625, + "learning_rate": 8.326101621200706e-07, + "loss": 0.0541, + "reward": 1.1941964626312256, + "reward_std": 0.1772399265319109, + "rewards/accuracy_reward": 0.2165178742725402, + "rewards/format_reward": 0.9776786118745804, + "step": 1190 + }, + { + "completion_length": 1002.3080749511719, + "epoch": 0.3557613322380703, + "grad_norm": 0.9585791230201721, + "kl": 0.478515625, + "learning_rate": 8.322447536712642e-07, + "loss": 0.0453, + "reward": 1.207589328289032, + "reward_std": 0.24638578295707703, + "rewards/accuracy_reward": 0.2433035857975483, + "rewards/format_reward": 0.9642857611179352, + "step": 1191 + }, + { + "completion_length": 1091.0000305175781, + "epoch": 0.3560600403255918, + "grad_norm": 0.8871986269950867, + "kl": 0.5546875, + "learning_rate": 8.318790381678283e-07, + "loss": 0.0726, + "reward": 1.1227679252624512, + "reward_std": 0.21378014609217644, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.9642857611179352, + "step": 1192 + }, + { + "completion_length": 1044.9732666015625, + "epoch": 0.35635874841311327, + "grad_norm": 0.8819432258605957, + "kl": 0.56982421875, + "learning_rate": 8.315130160076263e-07, + "loss": 0.0349, + "reward": 1.1406250596046448, + "reward_std": 0.16071583330631256, + "rewards/accuracy_reward": 0.16517858393490314, + "rewards/format_reward": 0.9754464775323868, + "step": 1193 + }, + { + "completion_length": 1002.9196929931641, + "epoch": 0.35665745650063474, + "grad_norm": 0.6557120680809021, + "kl": 0.43603515625, + "learning_rate": 8.311466875888539e-07, + "loss": 0.0305, + "reward": 1.2232143580913544, + "reward_std": 0.15697832591831684, + "rewards/accuracy_reward": 0.23883929662406445, + "rewards/format_reward": 0.9843750447034836, + "step": 1194 + }, + { + "completion_length": 961.9598693847656, + "epoch": 0.3569561645881562, + "grad_norm": 1.1811113357543945, + "kl": 0.333984375, + "learning_rate": 8.307800533100409e-07, + "loss": 0.0045, + "reward": 1.1718750596046448, + "reward_std": 0.17029685527086258, + "rewards/accuracy_reward": 0.20758929662406445, + "rewards/format_reward": 0.9642857611179352, + "step": 1195 + }, + { + "completion_length": 1090.1697082519531, + "epoch": 0.3572548726756777, + "grad_norm": 0.7915575504302979, + "kl": 0.364501953125, + "learning_rate": 8.304131135700493e-07, + "loss": 0.0544, + "reward": 1.113839328289032, + "reward_std": 0.2321111410856247, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9687500447034836, + "step": 1196 + }, + { + "completion_length": 1017.6697082519531, + "epoch": 0.35755358076319915, + "grad_norm": 0.5951302647590637, + "kl": 0.318359375, + "learning_rate": 8.300458687680736e-07, + "loss": 0.0327, + "reward": 1.1651786267757416, + "reward_std": 0.19971517473459244, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9687500447034836, + "step": 1197 + }, + { + "completion_length": 900.0513763427734, + "epoch": 0.3578522888507206, + "grad_norm": 0.9399256110191345, + "kl": 0.28515625, + "learning_rate": 8.296783193036399e-07, + "loss": 0.0333, + "reward": 1.2589286267757416, + "reward_std": 0.1685355007648468, + "rewards/accuracy_reward": 0.2790178656578064, + "rewards/format_reward": 0.979910746216774, + "step": 1198 + }, + { + "completion_length": 971.6964569091797, + "epoch": 0.3581509969382421, + "grad_norm": 0.9653859734535217, + "kl": 0.375, + "learning_rate": 8.293104655766066e-07, + "loss": 0.0481, + "reward": 1.1406250596046448, + "reward_std": 0.2099161297082901, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9776786118745804, + "step": 1199 + }, + { + "completion_length": 1029.0379791259766, + "epoch": 0.35844970502576357, + "grad_norm": 0.9414172172546387, + "kl": 0.306640625, + "learning_rate": 8.289423079871618e-07, + "loss": 0.0524, + "reward": 1.1071428954601288, + "reward_std": 0.16777200996875763, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.9687500447034836, + "step": 1200 + }, + { + "completion_length": 1007.8482971191406, + "epoch": 0.35874841311328504, + "grad_norm": 0.5274280905723572, + "kl": 0.35107421875, + "learning_rate": 8.285738469358253e-07, + "loss": 0.0312, + "reward": 1.2232143580913544, + "reward_std": 0.2164122313261032, + "rewards/accuracy_reward": 0.2433035783469677, + "rewards/format_reward": 0.979910746216774, + "step": 1201 + }, + { + "completion_length": 952.3259429931641, + "epoch": 0.3590471212008065, + "grad_norm": 1.2211390733718872, + "kl": 0.376953125, + "learning_rate": 8.282050828234464e-07, + "loss": 0.06, + "reward": 1.0870536267757416, + "reward_std": 0.18894468620419502, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9687500298023224, + "step": 1202 + }, + { + "completion_length": 1076.7947082519531, + "epoch": 0.359345829288328, + "grad_norm": 0.727620005607605, + "kl": 0.373046875, + "learning_rate": 8.278360160512046e-07, + "loss": 0.0319, + "reward": 1.055803656578064, + "reward_std": 0.20886290818452835, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.9598214775323868, + "step": 1203 + }, + { + "completion_length": 1081.4308471679688, + "epoch": 0.35964453737584945, + "grad_norm": 0.6997358202934265, + "kl": 0.48486328125, + "learning_rate": 8.27466647020608e-07, + "loss": 0.0292, + "reward": 1.051339328289032, + "reward_std": 0.20242588222026825, + "rewards/accuracy_reward": 0.09151785750873387, + "rewards/format_reward": 0.9598214775323868, + "step": 1204 + }, + { + "completion_length": 1022.7902069091797, + "epoch": 0.3599432454633709, + "grad_norm": 0.8327034711837769, + "kl": 0.49169921875, + "learning_rate": 8.270969761334944e-07, + "loss": 0.0468, + "reward": 1.1294643580913544, + "reward_std": 0.21040759608149529, + "rewards/accuracy_reward": 0.15178571827709675, + "rewards/format_reward": 0.9776786118745804, + "step": 1205 + }, + { + "completion_length": 1018.0893249511719, + "epoch": 0.3602419535508924, + "grad_norm": 0.6316136121749878, + "kl": 0.43798828125, + "learning_rate": 8.267270037920288e-07, + "loss": 0.0695, + "reward": 1.082589328289032, + "reward_std": 0.2052907571196556, + "rewards/accuracy_reward": 0.11607143213041127, + "rewards/format_reward": 0.96651791036129, + "step": 1206 + }, + { + "completion_length": 1000.9598693847656, + "epoch": 0.36054066163841386, + "grad_norm": 0.778264045715332, + "kl": 0.458984375, + "learning_rate": 8.263567303987056e-07, + "loss": 0.0339, + "reward": 1.0825893580913544, + "reward_std": 0.2783246599137783, + "rewards/accuracy_reward": 0.1138392873108387, + "rewards/format_reward": 0.9687500447034836, + "step": 1207 + }, + { + "completion_length": 984.6920318603516, + "epoch": 0.36083936972593533, + "grad_norm": 0.5087599158287048, + "kl": 0.41845703125, + "learning_rate": 8.259861563563453e-07, + "loss": 0.0536, + "reward": 1.080357164144516, + "reward_std": 0.1763086998835206, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9776786118745804, + "step": 1208 + }, + { + "completion_length": 995.3906860351562, + "epoch": 0.3611380778134568, + "grad_norm": 0.6215347647666931, + "kl": 0.4931640625, + "learning_rate": 8.256152820680967e-07, + "loss": 0.0725, + "reward": 1.1540178954601288, + "reward_std": 0.25835032016038895, + "rewards/accuracy_reward": 0.19196428917348385, + "rewards/format_reward": 0.9620536118745804, + "step": 1209 + }, + { + "completion_length": 1035.8661193847656, + "epoch": 0.3614367859009783, + "grad_norm": 1.3934637308120728, + "kl": 0.44482421875, + "learning_rate": 8.252441079374342e-07, + "loss": 0.0528, + "reward": 1.1138392984867096, + "reward_std": 0.250568438321352, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9754464775323868, + "step": 1210 + }, + { + "completion_length": 1062.8393249511719, + "epoch": 0.36173549398849975, + "grad_norm": 0.7227892279624939, + "kl": 0.48583984375, + "learning_rate": 8.248726343681591e-07, + "loss": 0.0583, + "reward": 1.1250000298023224, + "reward_std": 0.24407900869846344, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.9665178954601288, + "step": 1211 + }, + { + "completion_length": 976.1429138183594, + "epoch": 0.3620342020760212, + "grad_norm": 0.6005772352218628, + "kl": 0.54736328125, + "learning_rate": 8.245008617643984e-07, + "loss": 0.0385, + "reward": 1.1830357909202576, + "reward_std": 0.21047092974185944, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9709821939468384, + "step": 1212 + }, + { + "completion_length": 1032.0000610351562, + "epoch": 0.3623329101635427, + "grad_norm": 0.7988520264625549, + "kl": 0.560546875, + "learning_rate": 8.241287905306038e-07, + "loss": 0.0248, + "reward": 1.2633928954601288, + "reward_std": 0.20787621103227139, + "rewards/accuracy_reward": 0.2946428805589676, + "rewards/format_reward": 0.9687500447034836, + "step": 1213 + }, + { + "completion_length": 950.7790679931641, + "epoch": 0.36263161825106416, + "grad_norm": 0.6264091730117798, + "kl": 0.421630859375, + "learning_rate": 8.237564210715528e-07, + "loss": 0.0468, + "reward": 1.1049107909202576, + "reward_std": 0.18522972241044044, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.973214328289032, + "step": 1214 + }, + { + "completion_length": 938.2232666015625, + "epoch": 0.36293032633858563, + "grad_norm": 0.9363739490509033, + "kl": 0.3603515625, + "learning_rate": 8.233837537923467e-07, + "loss": 0.0392, + "reward": 1.2455357611179352, + "reward_std": 0.24871613830327988, + "rewards/accuracy_reward": 0.2589285857975483, + "rewards/format_reward": 0.9866071939468384, + "step": 1215 + }, + { + "completion_length": 1064.7344055175781, + "epoch": 0.3632290344261071, + "grad_norm": 1.1959145069122314, + "kl": 0.50927734375, + "learning_rate": 8.230107890984109e-07, + "loss": 0.0527, + "reward": 1.0915178954601288, + "reward_std": 0.2237197943031788, + "rewards/accuracy_reward": 0.12946428963914514, + "rewards/format_reward": 0.9620536118745804, + "step": 1216 + }, + { + "completion_length": 1086.3460693359375, + "epoch": 0.36352774251362857, + "grad_norm": 0.6153210997581482, + "kl": 0.445556640625, + "learning_rate": 8.226375273954945e-07, + "loss": 0.0553, + "reward": 1.0915178954601288, + "reward_std": 0.19900260865688324, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.973214328289032, + "step": 1217 + }, + { + "completion_length": 993.9643402099609, + "epoch": 0.36382645060115004, + "grad_norm": 0.964860200881958, + "kl": 0.47412109375, + "learning_rate": 8.222639690896698e-07, + "loss": 0.049, + "reward": 1.1919643580913544, + "reward_std": 0.2113104835152626, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.9821428805589676, + "step": 1218 + }, + { + "completion_length": 1067.8616638183594, + "epoch": 0.3641251586886715, + "grad_norm": 0.8853161334991455, + "kl": 0.44189453125, + "learning_rate": 8.218901145873312e-07, + "loss": 0.0661, + "reward": 1.100446492433548, + "reward_std": 0.18362212553620338, + "rewards/accuracy_reward": 0.12946429033763707, + "rewards/format_reward": 0.9709821790456772, + "step": 1219 + }, + { + "completion_length": 986.4777221679688, + "epoch": 0.364423866776193, + "grad_norm": 0.7878938913345337, + "kl": 0.41455078125, + "learning_rate": 8.215159642951962e-07, + "loss": 0.0455, + "reward": 1.209821492433548, + "reward_std": 0.16762839443981647, + "rewards/accuracy_reward": 0.2299107201397419, + "rewards/format_reward": 0.9799107611179352, + "step": 1220 + }, + { + "completion_length": 1105.636215209961, + "epoch": 0.36472257486371445, + "grad_norm": 0.7267536520957947, + "kl": 0.63818359375, + "learning_rate": 8.211415186203033e-07, + "loss": 0.0549, + "reward": 1.0915179252624512, + "reward_std": 0.2733280286192894, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.964285746216774, + "step": 1221 + }, + { + "completion_length": 1075.4732513427734, + "epoch": 0.3650212829512359, + "grad_norm": 0.4580187201499939, + "kl": 0.466796875, + "learning_rate": 8.207667779700131e-07, + "loss": 0.0496, + "reward": 1.1361607909202576, + "reward_std": 0.24079784750938416, + "rewards/accuracy_reward": 0.16517858114093542, + "rewards/format_reward": 0.9709821939468384, + "step": 1222 + }, + { + "completion_length": 1050.7433471679688, + "epoch": 0.3653199910387574, + "grad_norm": 0.6939496994018555, + "kl": 0.430419921875, + "learning_rate": 8.203917427520064e-07, + "loss": 0.0309, + "reward": 1.0803572237491608, + "reward_std": 0.13149765133857727, + "rewards/accuracy_reward": 0.09598215017467737, + "rewards/format_reward": 0.9843750447034836, + "step": 1223 + }, + { + "completion_length": 1088.654052734375, + "epoch": 0.36561869912627887, + "grad_norm": 0.9480755925178528, + "kl": 0.5625, + "learning_rate": 8.200164133742847e-07, + "loss": 0.0547, + "reward": 1.1830357611179352, + "reward_std": 0.2182910367846489, + "rewards/accuracy_reward": 0.20089286426082253, + "rewards/format_reward": 0.9821428805589676, + "step": 1224 + }, + { + "completion_length": 1078.2120971679688, + "epoch": 0.36591740721380034, + "grad_norm": 1.0232263803482056, + "kl": 0.41650390625, + "learning_rate": 8.196407902451699e-07, + "loss": 0.0719, + "reward": 1.0558036267757416, + "reward_std": 0.1951271966099739, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.9598214775323868, + "step": 1225 + }, + { + "completion_length": 991.3951416015625, + "epoch": 0.3662161153013218, + "grad_norm": 0.7625141143798828, + "kl": 0.4140625, + "learning_rate": 8.192648737733026e-07, + "loss": 0.0393, + "reward": 1.1383928954601288, + "reward_std": 0.204526636749506, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.973214328289032, + "step": 1226 + }, + { + "completion_length": 1107.1629791259766, + "epoch": 0.3665148233888433, + "grad_norm": 1.393204927444458, + "kl": 0.319580078125, + "learning_rate": 8.188886643676438e-07, + "loss": 0.0636, + "reward": 1.2343750596046448, + "reward_std": 0.25439638644456863, + "rewards/accuracy_reward": 0.2633928693830967, + "rewards/format_reward": 0.9709821939468384, + "step": 1227 + }, + { + "completion_length": 1019.1138610839844, + "epoch": 0.3668135314763647, + "grad_norm": 0.7661411762237549, + "kl": 0.37353515625, + "learning_rate": 8.185121624374719e-07, + "loss": 0.0118, + "reward": 1.189732164144516, + "reward_std": 0.18834521621465683, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9799107611179352, + "step": 1228 + }, + { + "completion_length": 1071.3906707763672, + "epoch": 0.36711223956388617, + "grad_norm": 0.8327420353889465, + "kl": 0.401611328125, + "learning_rate": 8.181353683923844e-07, + "loss": 0.0657, + "reward": 1.2299107611179352, + "reward_std": 0.29901695996522903, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.9732143133878708, + "step": 1229 + }, + { + "completion_length": 981.3817291259766, + "epoch": 0.36741094765140764, + "grad_norm": 1.790602684020996, + "kl": 0.35791015625, + "learning_rate": 8.177582826422961e-07, + "loss": 0.0946, + "reward": 1.212053656578064, + "reward_std": 0.25146325677633286, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9754464626312256, + "step": 1230 + }, + { + "completion_length": 967.7678833007812, + "epoch": 0.3677096557389291, + "grad_norm": 0.5083703994750977, + "kl": 0.4501953125, + "learning_rate": 8.173809055974394e-07, + "loss": 0.0748, + "reward": 1.1495535969734192, + "reward_std": 0.19871985912322998, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9687500298023224, + "step": 1231 + }, + { + "completion_length": 999.9754943847656, + "epoch": 0.3680083638264506, + "grad_norm": 0.9127557873725891, + "kl": 0.5048828125, + "learning_rate": 8.170032376683637e-07, + "loss": 0.0748, + "reward": 1.0625000596046448, + "reward_std": 0.2107374258339405, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.9598214626312256, + "step": 1232 + }, + { + "completion_length": 905.8036041259766, + "epoch": 0.36830707191397205, + "grad_norm": 0.6715912818908691, + "kl": 0.58935546875, + "learning_rate": 8.166252792659344e-07, + "loss": 0.0866, + "reward": 1.1964286267757416, + "reward_std": 0.14760761708021164, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9799107611179352, + "step": 1233 + }, + { + "completion_length": 1044.2388916015625, + "epoch": 0.3686057800014935, + "grad_norm": 1.2166965007781982, + "kl": 0.626953125, + "learning_rate": 8.162470308013332e-07, + "loss": 0.0791, + "reward": 1.1674107611179352, + "reward_std": 0.2903943993151188, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9553571790456772, + "step": 1234 + }, + { + "completion_length": 953.1585235595703, + "epoch": 0.368904488089015, + "grad_norm": 0.9831138253211975, + "kl": 0.701171875, + "learning_rate": 8.158684926860579e-07, + "loss": 0.0793, + "reward": 1.1160714626312256, + "reward_std": 0.16760787460952997, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9754464626312256, + "step": 1235 + }, + { + "completion_length": 1016.5647735595703, + "epoch": 0.36920319617653646, + "grad_norm": 1.504720687866211, + "kl": 0.6748046875, + "learning_rate": 8.154896653319202e-07, + "loss": 0.0444, + "reward": 1.035714328289032, + "reward_std": 0.1999862790107727, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.9598214775323868, + "step": 1236 + }, + { + "completion_length": 982.2098693847656, + "epoch": 0.36950190426405793, + "grad_norm": 1.8288413286209106, + "kl": 0.7705078125, + "learning_rate": 8.151105491510473e-07, + "loss": 0.0443, + "reward": 1.1562500596046448, + "reward_std": 0.18977796286344528, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9754464775323868, + "step": 1237 + }, + { + "completion_length": 1023.9018249511719, + "epoch": 0.3698006123515794, + "grad_norm": 1.9564098119735718, + "kl": 0.8056640625, + "learning_rate": 8.147311445558807e-07, + "loss": 0.0533, + "reward": 1.145089328289032, + "reward_std": 0.19585933536291122, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9687500298023224, + "step": 1238 + }, + { + "completion_length": 927.3951263427734, + "epoch": 0.3700993204391009, + "grad_norm": 1.0542443990707397, + "kl": 0.634765625, + "learning_rate": 8.143514519591754e-07, + "loss": 0.04, + "reward": 1.1562500596046448, + "reward_std": 0.1976831890642643, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9665178954601288, + "step": 1239 + }, + { + "completion_length": 1004.3616638183594, + "epoch": 0.37039802852662235, + "grad_norm": 0.9379754066467285, + "kl": 0.62744140625, + "learning_rate": 8.139714717739993e-07, + "loss": 0.0427, + "reward": 1.162946492433548, + "reward_std": 0.18116367422044277, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9754464626312256, + "step": 1240 + }, + { + "completion_length": 1038.7143096923828, + "epoch": 0.3706967366141438, + "grad_norm": 1.1411511898040771, + "kl": 0.5771484375, + "learning_rate": 8.135912044137342e-07, + "loss": 0.0385, + "reward": 1.0178571790456772, + "reward_std": 0.16423538140952587, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.9598214626312256, + "step": 1241 + }, + { + "completion_length": 998.7768249511719, + "epoch": 0.3709954447016653, + "grad_norm": 0.8514959812164307, + "kl": 0.4443359375, + "learning_rate": 8.132106502920733e-07, + "loss": 0.0353, + "reward": 1.1004464626312256, + "reward_std": 0.25410038977861404, + "rewards/accuracy_reward": 0.13839286006987095, + "rewards/format_reward": 0.9620535969734192, + "step": 1242 + }, + { + "completion_length": 1066.6875457763672, + "epoch": 0.37129415278918676, + "grad_norm": 0.6017686128616333, + "kl": 0.36669921875, + "learning_rate": 8.128298098230222e-07, + "loss": 0.0473, + "reward": 1.1071429252624512, + "reward_std": 0.19125007465481758, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.9821428805589676, + "step": 1243 + }, + { + "completion_length": 1104.8170166015625, + "epoch": 0.37159286087670823, + "grad_norm": 0.5053695440292358, + "kl": 0.3818359375, + "learning_rate": 8.124486834208981e-07, + "loss": 0.0346, + "reward": 1.0803571939468384, + "reward_std": 0.18856855295598507, + "rewards/accuracy_reward": 0.11383928661234677, + "rewards/format_reward": 0.9665178954601288, + "step": 1244 + }, + { + "completion_length": 1230.7545166015625, + "epoch": 0.3718915689642297, + "grad_norm": 0.5615864396095276, + "kl": 0.3017578125, + "learning_rate": 8.120672715003294e-07, + "loss": 0.0214, + "reward": 1.1540178954601288, + "reward_std": 0.1556169968098402, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9843750298023224, + "step": 1245 + }, + { + "completion_length": 1109.1897735595703, + "epoch": 0.3721902770517512, + "grad_norm": 0.8168032765388489, + "kl": 0.3173828125, + "learning_rate": 8.116855744762544e-07, + "loss": 0.0253, + "reward": 1.1830357760190964, + "reward_std": 0.20959574356675148, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9598214775323868, + "step": 1246 + }, + { + "completion_length": 1103.7076416015625, + "epoch": 0.37248898513927264, + "grad_norm": 0.6885372400283813, + "kl": 0.269775390625, + "learning_rate": 8.113035927639226e-07, + "loss": 0.0292, + "reward": 1.1473214626312256, + "reward_std": 0.2022860236465931, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9776785969734192, + "step": 1247 + }, + { + "completion_length": 1020.4487152099609, + "epoch": 0.3727876932267941, + "grad_norm": 0.5656836032867432, + "kl": 0.265380859375, + "learning_rate": 8.109213267788921e-07, + "loss": 0.0265, + "reward": 1.162946492433548, + "reward_std": 0.21654951758682728, + "rewards/accuracy_reward": 0.18526786798611283, + "rewards/format_reward": 0.9776785969734192, + "step": 1248 + }, + { + "completion_length": 1135.9732666015625, + "epoch": 0.3730864013143156, + "grad_norm": 0.5383604764938354, + "kl": 0.2470703125, + "learning_rate": 8.105387769370312e-07, + "loss": 0.0213, + "reward": 1.1651786267757416, + "reward_std": 0.18523560836911201, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9776785969734192, + "step": 1249 + }, + { + "completion_length": 1121.7522735595703, + "epoch": 0.37338510940183706, + "grad_norm": 0.7931256294250488, + "kl": 0.34033203125, + "learning_rate": 8.101559436545165e-07, + "loss": 0.0352, + "reward": 1.1361607313156128, + "reward_std": 0.22279032692313194, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9687500596046448, + "step": 1250 + }, + { + "completion_length": 1155.8125915527344, + "epoch": 0.3736838174893585, + "grad_norm": 0.5801315307617188, + "kl": 0.34716796875, + "learning_rate": 8.097728273478332e-07, + "loss": 0.0495, + "reward": 1.0803571939468384, + "reward_std": 0.21098225563764572, + "rewards/accuracy_reward": 0.12276785913854837, + "rewards/format_reward": 0.9575893133878708, + "step": 1251 + }, + { + "completion_length": 1108.1473846435547, + "epoch": 0.37398252557688, + "grad_norm": 1.1289706230163574, + "kl": 0.317138671875, + "learning_rate": 8.093894284337742e-07, + "loss": 0.0402, + "reward": 1.1629464626312256, + "reward_std": 0.24364111572504044, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9598214626312256, + "step": 1252 + }, + { + "completion_length": 1043.0469055175781, + "epoch": 0.37428123366440147, + "grad_norm": 0.42773082852363586, + "kl": 0.242431640625, + "learning_rate": 8.090057473294398e-07, + "loss": 0.0253, + "reward": 1.0959821939468384, + "reward_std": 0.14776318706572056, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.9821428805589676, + "step": 1253 + }, + { + "completion_length": 1082.3348846435547, + "epoch": 0.37457994175192294, + "grad_norm": 1.07021963596344, + "kl": 0.30126953125, + "learning_rate": 8.086217844522377e-07, + "loss": 0.044, + "reward": 1.1674107611179352, + "reward_std": 0.19510815106332302, + "rewards/accuracy_reward": 0.18973215413279831, + "rewards/format_reward": 0.9776786267757416, + "step": 1254 + }, + { + "completion_length": 1098.0603332519531, + "epoch": 0.3748786498394444, + "grad_norm": 0.9351935982704163, + "kl": 0.336669921875, + "learning_rate": 8.082375402198819e-07, + "loss": 0.0529, + "reward": 1.0781250894069672, + "reward_std": 0.19594995491206646, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.9687500447034836, + "step": 1255 + }, + { + "completion_length": 1148.3750305175781, + "epoch": 0.3751773579269659, + "grad_norm": 0.9945690631866455, + "kl": 0.45703125, + "learning_rate": 8.078530150503923e-07, + "loss": 0.0456, + "reward": 1.0892857909202576, + "reward_std": 0.21793914213776588, + "rewards/accuracy_reward": 0.12723214668221772, + "rewards/format_reward": 0.9620535969734192, + "step": 1256 + }, + { + "completion_length": 1042.1629791259766, + "epoch": 0.37547606601448735, + "grad_norm": 0.6425369381904602, + "kl": 0.484375, + "learning_rate": 8.074682093620946e-07, + "loss": 0.0366, + "reward": 1.037946492433548, + "reward_std": 0.18364215455949306, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.973214328289032, + "step": 1257 + }, + { + "completion_length": 971.9063110351562, + "epoch": 0.3757747741020088, + "grad_norm": 1.1351476907730103, + "kl": 0.49853515625, + "learning_rate": 8.070831235736197e-07, + "loss": 0.0303, + "reward": 1.129464328289032, + "reward_std": 0.13866574503481388, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.9799107611179352, + "step": 1258 + }, + { + "completion_length": 1159.1138916015625, + "epoch": 0.3760734821895303, + "grad_norm": 1.4718488454818726, + "kl": 0.814453125, + "learning_rate": 8.066977581039033e-07, + "loss": 0.052, + "reward": 1.1049107909202576, + "reward_std": 0.23015449941158295, + "rewards/accuracy_reward": 0.14508929662406445, + "rewards/format_reward": 0.9598214626312256, + "step": 1259 + }, + { + "completion_length": 1011.0335235595703, + "epoch": 0.37637219027705177, + "grad_norm": 1.1877079010009766, + "kl": 0.787109375, + "learning_rate": 8.063121133721849e-07, + "loss": 0.0466, + "reward": 1.1674107611179352, + "reward_std": 0.20235652476549149, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.973214328289032, + "step": 1260 + }, + { + "completion_length": 1013.2143249511719, + "epoch": 0.37667089836457324, + "grad_norm": 0.8397384881973267, + "kl": 0.6748046875, + "learning_rate": 8.059261897980086e-07, + "loss": 0.0724, + "reward": 1.113839328289032, + "reward_std": 0.22130518034100533, + "rewards/accuracy_reward": 0.14508928824216127, + "rewards/format_reward": 0.9687500447034836, + "step": 1261 + }, + { + "completion_length": 977.0045166015625, + "epoch": 0.3769696064520947, + "grad_norm": 1.3130906820297241, + "kl": 0.64599609375, + "learning_rate": 8.055399878012214e-07, + "loss": 0.0655, + "reward": 1.1406250447034836, + "reward_std": 0.21624304354190826, + "rewards/accuracy_reward": 0.18303572502918541, + "rewards/format_reward": 0.957589328289032, + "step": 1262 + }, + { + "completion_length": 1048.7879791259766, + "epoch": 0.3772683145396162, + "grad_norm": 0.8614626526832581, + "kl": 0.68359375, + "learning_rate": 8.051535078019729e-07, + "loss": 0.0689, + "reward": 1.1562500596046448, + "reward_std": 0.27282898128032684, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9598214626312256, + "step": 1263 + }, + { + "completion_length": 991.8772735595703, + "epoch": 0.37756702262713765, + "grad_norm": 0.8286159038543701, + "kl": 0.689453125, + "learning_rate": 8.047667502207157e-07, + "loss": 0.0627, + "reward": 1.1517857611179352, + "reward_std": 0.2995161712169647, + "rewards/accuracy_reward": 0.19196429662406445, + "rewards/format_reward": 0.9598214775323868, + "step": 1264 + }, + { + "completion_length": 1140.4866638183594, + "epoch": 0.3778657307146591, + "grad_norm": 0.858707845211029, + "kl": 0.64404296875, + "learning_rate": 8.043797154782041e-07, + "loss": 0.0568, + "reward": 1.0714286267757416, + "reward_std": 0.17480899393558502, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.9687500447034836, + "step": 1265 + }, + { + "completion_length": 1038.9085388183594, + "epoch": 0.3781644388021806, + "grad_norm": 0.5580734610557556, + "kl": 0.470703125, + "learning_rate": 8.039924039954939e-07, + "loss": 0.0502, + "reward": 1.1450893580913544, + "reward_std": 0.30700792372226715, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.957589328289032, + "step": 1266 + }, + { + "completion_length": 1033.6652221679688, + "epoch": 0.37846314688970206, + "grad_norm": 1.1960697174072266, + "kl": 0.35400390625, + "learning_rate": 8.036048161939422e-07, + "loss": 0.0631, + "reward": 1.1316964626312256, + "reward_std": 0.21333079412579536, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9620536118745804, + "step": 1267 + }, + { + "completion_length": 995.6652221679688, + "epoch": 0.37876185497722353, + "grad_norm": 1.2353492975234985, + "kl": 0.39990234375, + "learning_rate": 8.032169524952062e-07, + "loss": 0.069, + "reward": 1.1897321939468384, + "reward_std": 0.21015794575214386, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.979910746216774, + "step": 1268 + }, + { + "completion_length": 1114.9353485107422, + "epoch": 0.379060563064745, + "grad_norm": 0.9831238389015198, + "kl": 0.64501953125, + "learning_rate": 8.028288133212441e-07, + "loss": 0.037, + "reward": 1.082589328289032, + "reward_std": 0.2644584849476814, + "rewards/accuracy_reward": 0.12723215157166123, + "rewards/format_reward": 0.9553571790456772, + "step": 1269 + }, + { + "completion_length": 1171.43310546875, + "epoch": 0.3793592711522665, + "grad_norm": 1.0195324420928955, + "kl": 0.5341796875, + "learning_rate": 8.024403990943128e-07, + "loss": 0.0944, + "reward": 1.1517857611179352, + "reward_std": 0.26615507528185844, + "rewards/accuracy_reward": 0.20089286309666932, + "rewards/format_reward": 0.95089291036129, + "step": 1270 + }, + { + "completion_length": 992.5937957763672, + "epoch": 0.3796579792397879, + "grad_norm": 1.365082859992981, + "kl": 0.39794921875, + "learning_rate": 8.020517102369692e-07, + "loss": 0.0668, + "reward": 1.131696492433548, + "reward_std": 0.2389909364283085, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.95089291036129, + "step": 1271 + }, + { + "completion_length": 1041.185317993164, + "epoch": 0.37995668732730936, + "grad_norm": 1.3154394626617432, + "kl": 0.43359375, + "learning_rate": 8.016627471720684e-07, + "loss": 0.0755, + "reward": 1.1428571939468384, + "reward_std": 0.2835914231836796, + "rewards/accuracy_reward": 0.18526786379516125, + "rewards/format_reward": 0.957589328289032, + "step": 1272 + }, + { + "completion_length": 1040.3616333007812, + "epoch": 0.38025539541483083, + "grad_norm": 0.791388988494873, + "kl": 0.37841796875, + "learning_rate": 8.012735103227644e-07, + "loss": 0.0616, + "reward": 1.2008928954601288, + "reward_std": 0.22147461026906967, + "rewards/accuracy_reward": 0.2254464440047741, + "rewards/format_reward": 0.9754464626312256, + "step": 1273 + }, + { + "completion_length": 986.3951263427734, + "epoch": 0.3805541035023523, + "grad_norm": 0.5291969776153564, + "kl": 0.49609375, + "learning_rate": 8.008840001125088e-07, + "loss": 0.097, + "reward": 1.1718750596046448, + "reward_std": 0.22515198215842247, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.98214291036129, + "step": 1274 + }, + { + "completion_length": 1036.3973541259766, + "epoch": 0.3808528115898738, + "grad_norm": 1.221097707748413, + "kl": 0.5576171875, + "learning_rate": 8.004942169650501e-07, + "loss": 0.0331, + "reward": 1.1316964775323868, + "reward_std": 0.21248343214392662, + "rewards/accuracy_reward": 0.16964286309666932, + "rewards/format_reward": 0.9620536118745804, + "step": 1275 + }, + { + "completion_length": 1022.2031707763672, + "epoch": 0.38115151967739525, + "grad_norm": 1.313026785850525, + "kl": 0.6953125, + "learning_rate": 8.001041613044346e-07, + "loss": 0.0734, + "reward": 1.1294643580913544, + "reward_std": 0.21225593611598015, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9709821939468384, + "step": 1276 + }, + { + "completion_length": 1100.0826263427734, + "epoch": 0.3814502277649167, + "grad_norm": 1.603394865989685, + "kl": 0.67822265625, + "learning_rate": 7.997138335550043e-07, + "loss": 0.0389, + "reward": 1.12276791036129, + "reward_std": 0.15357945673167706, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.9620536267757416, + "step": 1277 + }, + { + "completion_length": 972.3371124267578, + "epoch": 0.3817489358524382, + "grad_norm": 1.5111039876937866, + "kl": 0.49365234375, + "learning_rate": 7.993232341413977e-07, + "loss": 0.1153, + "reward": 1.2433035969734192, + "reward_std": 0.2553333044052124, + "rewards/accuracy_reward": 0.2790178656578064, + "rewards/format_reward": 0.964285746216774, + "step": 1278 + }, + { + "completion_length": 1064.9866638183594, + "epoch": 0.38204764393995966, + "grad_norm": 1.0292437076568604, + "kl": 0.52978515625, + "learning_rate": 7.989323634885488e-07, + "loss": 0.0377, + "reward": 1.1495536267757416, + "reward_std": 0.19258035346865654, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9776785969734192, + "step": 1279 + }, + { + "completion_length": 939.1495971679688, + "epoch": 0.38234635202748113, + "grad_norm": 0.7840143442153931, + "kl": 0.56787109375, + "learning_rate": 7.985412220216861e-07, + "loss": 0.0251, + "reward": 1.120535746216774, + "reward_std": 0.2440393976867199, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.9620536267757416, + "step": 1280 + }, + { + "completion_length": 1090.8929138183594, + "epoch": 0.3826450601150026, + "grad_norm": 0.7385126948356628, + "kl": 0.53173828125, + "learning_rate": 7.981498101663337e-07, + "loss": 0.0397, + "reward": 1.0647321939468384, + "reward_std": 0.21821128949522972, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.9687500447034836, + "step": 1281 + }, + { + "completion_length": 1079.5647888183594, + "epoch": 0.38294376820252407, + "grad_norm": 0.8661399483680725, + "kl": 0.6572265625, + "learning_rate": 7.977581283483091e-07, + "loss": 0.0701, + "reward": 1.1852678954601288, + "reward_std": 0.21739915758371353, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.9754464626312256, + "step": 1282 + }, + { + "completion_length": 1062.8817443847656, + "epoch": 0.38324247629004554, + "grad_norm": 0.9968939423561096, + "kl": 0.6962890625, + "learning_rate": 7.973661769937239e-07, + "loss": 0.0726, + "reward": 1.1138393431901932, + "reward_std": 0.22483371198177338, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9531250298023224, + "step": 1283 + }, + { + "completion_length": 1141.3393249511719, + "epoch": 0.383541184377567, + "grad_norm": 0.8679356575012207, + "kl": 0.3662109375, + "learning_rate": 7.969739565289826e-07, + "loss": 0.0211, + "reward": 1.1383929252624512, + "reward_std": 0.16549287363886833, + "rewards/accuracy_reward": 0.16071429662406445, + "rewards/format_reward": 0.9776786118745804, + "step": 1284 + }, + { + "completion_length": 1000.6629943847656, + "epoch": 0.3838398924650885, + "grad_norm": 0.4585608243942261, + "kl": 0.4189453125, + "learning_rate": 7.965814673807825e-07, + "loss": 0.0347, + "reward": 1.1339286267757416, + "reward_std": 0.17420705780386925, + "rewards/accuracy_reward": 0.14508929336443543, + "rewards/format_reward": 0.9888393133878708, + "step": 1285 + }, + { + "completion_length": 1068.3638916015625, + "epoch": 0.38413860055260995, + "grad_norm": 0.8922024369239807, + "kl": 0.3291015625, + "learning_rate": 7.961887099761136e-07, + "loss": 0.0325, + "reward": 1.0334821939468384, + "reward_std": 0.14689771179109812, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.98214291036129, + "step": 1286 + }, + { + "completion_length": 1102.7835235595703, + "epoch": 0.3844373086401314, + "grad_norm": 0.4300904870033264, + "kl": 0.383056640625, + "learning_rate": 7.957956847422572e-07, + "loss": 0.0375, + "reward": 1.0848214626312256, + "reward_std": 0.17089252918958664, + "rewards/accuracy_reward": 0.10937500698491931, + "rewards/format_reward": 0.9754464775323868, + "step": 1287 + }, + { + "completion_length": 1097.4754943847656, + "epoch": 0.3847360167276529, + "grad_norm": 0.7403652667999268, + "kl": 0.38818359375, + "learning_rate": 7.954023921067865e-07, + "loss": 0.0304, + "reward": 1.254464328289032, + "reward_std": 0.20513079687952995, + "rewards/accuracy_reward": 0.2767857238650322, + "rewards/format_reward": 0.9776786118745804, + "step": 1288 + }, + { + "completion_length": 1056.9844207763672, + "epoch": 0.38503472481517437, + "grad_norm": 0.5856925249099731, + "kl": 0.3876953125, + "learning_rate": 7.95008832497565e-07, + "loss": -0.0013, + "reward": 1.1049107611179352, + "reward_std": 0.2258751392364502, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9709821939468384, + "step": 1289 + }, + { + "completion_length": 1122.9420166015625, + "epoch": 0.38533343290269584, + "grad_norm": 0.5180642008781433, + "kl": 0.300048828125, + "learning_rate": 7.946150063427473e-07, + "loss": -0.0035, + "reward": 1.0825893580913544, + "reward_std": 0.11555210407823324, + "rewards/accuracy_reward": 0.09598214598372579, + "rewards/format_reward": 0.9866071939468384, + "step": 1290 + }, + { + "completion_length": 1101.9375457763672, + "epoch": 0.3856321409902173, + "grad_norm": 0.5688227415084839, + "kl": 0.29296875, + "learning_rate": 7.942209140707777e-07, + "loss": 0.0217, + "reward": 1.0803571939468384, + "reward_std": 0.1771563682705164, + "rewards/accuracy_reward": 0.09151786146685481, + "rewards/format_reward": 0.988839328289032, + "step": 1291 + }, + { + "completion_length": 1132.8036193847656, + "epoch": 0.3859308490777388, + "grad_norm": 1.0375252962112427, + "kl": 0.3330078125, + "learning_rate": 7.938265561103897e-07, + "loss": 0.0259, + "reward": 1.2142857313156128, + "reward_std": 0.1985662542283535, + "rewards/accuracy_reward": 0.2366071492433548, + "rewards/format_reward": 0.9776786118745804, + "step": 1292 + }, + { + "completion_length": 1157.8371276855469, + "epoch": 0.38622955716526025, + "grad_norm": 0.39911314845085144, + "kl": 0.265380859375, + "learning_rate": 7.934319328906061e-07, + "loss": -0.0034, + "reward": 1.0625000596046448, + "reward_std": 0.11368322186172009, + "rewards/accuracy_reward": 0.07366071618162096, + "rewards/format_reward": 0.9888393133878708, + "step": 1293 + }, + { + "completion_length": 1166.6652221679688, + "epoch": 0.3865282652527817, + "grad_norm": 0.6207835674285889, + "kl": 0.36279296875, + "learning_rate": 7.930370448407386e-07, + "loss": 0.0011, + "reward": 1.147321492433548, + "reward_std": 0.2282610535621643, + "rewards/accuracy_reward": 0.17633929569274187, + "rewards/format_reward": 0.9709821790456772, + "step": 1294 + }, + { + "completion_length": 1151.62060546875, + "epoch": 0.3868269733403032, + "grad_norm": 1.12401282787323, + "kl": 0.35791015625, + "learning_rate": 7.926418923903863e-07, + "loss": 0.0263, + "reward": 1.2120535969734192, + "reward_std": 0.26260218769311905, + "rewards/accuracy_reward": 0.24776786053553224, + "rewards/format_reward": 0.9642857611179352, + "step": 1295 + }, + { + "completion_length": 1160.7857971191406, + "epoch": 0.38712568142782466, + "grad_norm": 0.9750357270240784, + "kl": 0.29638671875, + "learning_rate": 7.922464759694369e-07, + "loss": 0.031, + "reward": 1.1250000596046448, + "reward_std": 0.20960838720202446, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.973214328289032, + "step": 1296 + }, + { + "completion_length": 1114.857192993164, + "epoch": 0.38742438951534613, + "grad_norm": 0.7155897617340088, + "kl": 0.375, + "learning_rate": 7.918507960080641e-07, + "loss": 0.0303, + "reward": 1.147321492433548, + "reward_std": 0.22248389944434166, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9754464626312256, + "step": 1297 + }, + { + "completion_length": 1134.46435546875, + "epoch": 0.3877230976028676, + "grad_norm": 0.8970679044723511, + "kl": 0.4716796875, + "learning_rate": 7.914548529367291e-07, + "loss": 0.0573, + "reward": 1.1049107611179352, + "reward_std": 0.26300855726003647, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.9665178805589676, + "step": 1298 + }, + { + "completion_length": 1120.8728332519531, + "epoch": 0.3880218056903891, + "grad_norm": 1.0191277265548706, + "kl": 0.50146484375, + "learning_rate": 7.910586471861793e-07, + "loss": 0.0497, + "reward": 1.116071492433548, + "reward_std": 0.17857437208294868, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.9776786118745804, + "step": 1299 + }, + { + "completion_length": 1042.8638916015625, + "epoch": 0.38832051377791055, + "grad_norm": 1.4710719585418701, + "kl": 0.5810546875, + "learning_rate": 7.906621791874477e-07, + "loss": 0.0392, + "reward": 1.1272321939468384, + "reward_std": 0.22637182101607323, + "rewards/accuracy_reward": 0.14955357694998384, + "rewards/format_reward": 0.9776786118745804, + "step": 1300 + }, + { + "completion_length": 1055.2522583007812, + "epoch": 0.388619221865432, + "grad_norm": 2.1030619144439697, + "kl": 0.626953125, + "learning_rate": 7.902654493718525e-07, + "loss": 0.0467, + "reward": 1.1808036267757416, + "reward_std": 0.2385573424398899, + "rewards/accuracy_reward": 0.21651786798611283, + "rewards/format_reward": 0.964285746216774, + "step": 1301 + }, + { + "completion_length": 1127.1965026855469, + "epoch": 0.3889179299529535, + "grad_norm": 1.1014089584350586, + "kl": 0.6162109375, + "learning_rate": 7.898684581709969e-07, + "loss": 0.0343, + "reward": 1.1361607611179352, + "reward_std": 0.2137689245864749, + "rewards/accuracy_reward": 0.16741071734577417, + "rewards/format_reward": 0.9687500447034836, + "step": 1302 + }, + { + "completion_length": 1106.9107360839844, + "epoch": 0.38921663804047496, + "grad_norm": 0.7384184002876282, + "kl": 0.47314453125, + "learning_rate": 7.894712060167686e-07, + "loss": 0.0399, + "reward": 1.0602679252624512, + "reward_std": 0.17597635462880135, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.9776786118745804, + "step": 1303 + }, + { + "completion_length": 1070.685317993164, + "epoch": 0.38951534612799643, + "grad_norm": 1.0445003509521484, + "kl": 0.494140625, + "learning_rate": 7.890736933413388e-07, + "loss": 0.0295, + "reward": 1.0491071939468384, + "reward_std": 0.1368128228932619, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.9732143431901932, + "step": 1304 + }, + { + "completion_length": 1071.4955596923828, + "epoch": 0.3898140542155179, + "grad_norm": 0.991575300693512, + "kl": 0.501953125, + "learning_rate": 7.886759205771624e-07, + "loss": 0.0446, + "reward": 1.0892857313156128, + "reward_std": 0.19699675776064396, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9709821939468384, + "step": 1305 + }, + { + "completion_length": 1072.8705749511719, + "epoch": 0.3901127623030394, + "grad_norm": 0.6434163451194763, + "kl": 0.4833984375, + "learning_rate": 7.882778881569769e-07, + "loss": 0.0269, + "reward": 1.1450893580913544, + "reward_std": 0.2272641323506832, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.964285746216774, + "step": 1306 + }, + { + "completion_length": 992.9821929931641, + "epoch": 0.39041147039056084, + "grad_norm": 0.5257969498634338, + "kl": 0.35205078125, + "learning_rate": 7.878795965138032e-07, + "loss": -0.0274, + "reward": 1.142857164144516, + "reward_std": 0.1554514579474926, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9799107611179352, + "step": 1307 + }, + { + "completion_length": 1047.7500457763672, + "epoch": 0.3907101784780823, + "grad_norm": 1.3440096378326416, + "kl": 0.41748046875, + "learning_rate": 7.874810460809429e-07, + "loss": 0.0723, + "reward": 1.084821492433548, + "reward_std": 0.23954516276717186, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.96651791036129, + "step": 1308 + }, + { + "completion_length": 1049.093765258789, + "epoch": 0.3910088865656038, + "grad_norm": 0.9513984322547913, + "kl": 0.470703125, + "learning_rate": 7.870822372919802e-07, + "loss": 0.0464, + "reward": 1.1183036267757416, + "reward_std": 0.2474626563489437, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9508928954601288, + "step": 1309 + }, + { + "completion_length": 1063.7031707763672, + "epoch": 0.39130759465312526, + "grad_norm": 0.7787140607833862, + "kl": 0.427734375, + "learning_rate": 7.866831705807801e-07, + "loss": 0.0594, + "reward": 1.113839328289032, + "reward_std": 0.2330658845603466, + "rewards/accuracy_reward": 0.14955358020961285, + "rewards/format_reward": 0.9642857611179352, + "step": 1310 + }, + { + "completion_length": 956.5312957763672, + "epoch": 0.39160630274064673, + "grad_norm": 0.8246996402740479, + "kl": 0.4443359375, + "learning_rate": 7.862838463814876e-07, + "loss": 0.0953, + "reward": 1.0892857611179352, + "reward_std": 0.22432317584753036, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 1311 + }, + { + "completion_length": 990.7723693847656, + "epoch": 0.3919050108281682, + "grad_norm": 0.7711923718452454, + "kl": 0.32373046875, + "learning_rate": 7.858842651285286e-07, + "loss": 0.0596, + "reward": 1.1406250298023224, + "reward_std": 0.17030875384807587, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.975446492433548, + "step": 1312 + }, + { + "completion_length": 1078.7857666015625, + "epoch": 0.39220371891568967, + "grad_norm": 1.278863787651062, + "kl": 0.546875, + "learning_rate": 7.854844272566082e-07, + "loss": 0.0627, + "reward": 1.2433036267757416, + "reward_std": 0.2697432152926922, + "rewards/accuracy_reward": 0.2745535783469677, + "rewards/format_reward": 0.9687500298023224, + "step": 1313 + }, + { + "completion_length": 929.3705749511719, + "epoch": 0.3925024270032111, + "grad_norm": 0.8983438014984131, + "kl": 0.35791015625, + "learning_rate": 7.850843332007111e-07, + "loss": 0.0722, + "reward": 1.1741071939468384, + "reward_std": 0.20666301622986794, + "rewards/accuracy_reward": 0.20535715483129025, + "rewards/format_reward": 0.9687500298023224, + "step": 1314 + }, + { + "completion_length": 1141.2857666015625, + "epoch": 0.39280113509073256, + "grad_norm": 0.7191588282585144, + "kl": 0.47705078125, + "learning_rate": 7.846839833961002e-07, + "loss": 0.0388, + "reward": 1.2343750596046448, + "reward_std": 0.2255326770246029, + "rewards/accuracy_reward": 0.2566964440047741, + "rewards/format_reward": 0.9776786118745804, + "step": 1315 + }, + { + "completion_length": 998.5826110839844, + "epoch": 0.393099843178254, + "grad_norm": 0.9330716729164124, + "kl": 0.595703125, + "learning_rate": 7.842833782783167e-07, + "loss": 0.0981, + "reward": 1.1808036267757416, + "reward_std": 0.25120165944099426, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.9709821939468384, + "step": 1316 + }, + { + "completion_length": 1073.0670166015625, + "epoch": 0.3933985512657755, + "grad_norm": 0.8512095808982849, + "kl": 0.62060546875, + "learning_rate": 7.8388251828318e-07, + "loss": 0.1125, + "reward": 1.0401786118745804, + "reward_std": 0.25947191566228867, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.95089291036129, + "step": 1317 + }, + { + "completion_length": 980.2143096923828, + "epoch": 0.39369725935329697, + "grad_norm": 1.7475277185440063, + "kl": 0.7265625, + "learning_rate": 7.834814038467864e-07, + "loss": 0.1405, + "reward": 1.0714286267757416, + "reward_std": 0.28874142467975616, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9263393133878708, + "step": 1318 + }, + { + "completion_length": 964.2679138183594, + "epoch": 0.39399596744081844, + "grad_norm": 0.9980912804603577, + "kl": 0.595703125, + "learning_rate": 7.830800354055088e-07, + "loss": 0.0762, + "reward": 1.0937500596046448, + "reward_std": 0.14433999173343182, + "rewards/accuracy_reward": 0.13169643585570157, + "rewards/format_reward": 0.9620536267757416, + "step": 1319 + }, + { + "completion_length": 929.6830749511719, + "epoch": 0.3942946755283399, + "grad_norm": 1.1509368419647217, + "kl": 0.64599609375, + "learning_rate": 7.826784133959972e-07, + "loss": 0.084, + "reward": 1.1004464626312256, + "reward_std": 0.17759577743709087, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9687500447034836, + "step": 1320 + }, + { + "completion_length": 1088.1049499511719, + "epoch": 0.3945933836158614, + "grad_norm": 28.767337799072266, + "kl": 1.7421875, + "learning_rate": 7.822765382551768e-07, + "loss": 0.2538, + "reward": 1.0245535969734192, + "reward_std": 0.2989867031574249, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.9285714626312256, + "step": 1321 + }, + { + "completion_length": 1003.4888916015625, + "epoch": 0.39489209170338285, + "grad_norm": 11.950621604919434, + "kl": 1.2001953125, + "learning_rate": 7.818744104202483e-07, + "loss": 0.1716, + "reward": 1.0758929252624512, + "reward_std": 0.26266369223594666, + "rewards/accuracy_reward": 0.12053571920841932, + "rewards/format_reward": 0.9553571790456772, + "step": 1322 + }, + { + "completion_length": 908.2098693847656, + "epoch": 0.3951907997909043, + "grad_norm": 1.6285487413406372, + "kl": 0.82421875, + "learning_rate": 7.814720303286871e-07, + "loss": 0.0962, + "reward": 1.080357164144516, + "reward_std": 0.22292851842939854, + "rewards/accuracy_reward": 0.12053571688011289, + "rewards/format_reward": 0.9598214775323868, + "step": 1323 + }, + { + "completion_length": 922.1986999511719, + "epoch": 0.3954895078784258, + "grad_norm": 2.0484278202056885, + "kl": 0.9912109375, + "learning_rate": 7.810693984182439e-07, + "loss": 0.111, + "reward": 1.109375074505806, + "reward_std": 0.23729100078344345, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9441964775323868, + "step": 1324 + }, + { + "completion_length": 934.7879791259766, + "epoch": 0.39578821596594727, + "grad_norm": 2.171124219894409, + "kl": 0.8125, + "learning_rate": 7.806665151269424e-07, + "loss": 0.0899, + "reward": 1.145089328289032, + "reward_std": 0.21125439181923866, + "rewards/accuracy_reward": 0.18973215483129025, + "rewards/format_reward": 0.9553571939468384, + "step": 1325 + }, + { + "completion_length": 940.0491485595703, + "epoch": 0.39608692405346874, + "grad_norm": 4.277523040771484, + "kl": 1.0107421875, + "learning_rate": 7.802633808930802e-07, + "loss": 0.0707, + "reward": 1.082589328289032, + "reward_std": 0.3260234668850899, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9151786118745804, + "step": 1326 + }, + { + "completion_length": 925.8504791259766, + "epoch": 0.3963856321409902, + "grad_norm": 1.3710201978683472, + "kl": 0.8154296875, + "learning_rate": 7.798599961552277e-07, + "loss": 0.0795, + "reward": 1.1116071939468384, + "reward_std": 0.24667997658252716, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.957589328289032, + "step": 1327 + }, + { + "completion_length": 958.9955902099609, + "epoch": 0.3966843402285117, + "grad_norm": 1.3479605913162231, + "kl": 0.7568359375, + "learning_rate": 7.794563613522284e-07, + "loss": 0.0501, + "reward": 1.1272322088479996, + "reward_std": 0.22425082325935364, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9375000447034836, + "step": 1328 + }, + { + "completion_length": 983.3549652099609, + "epoch": 0.39698304831603315, + "grad_norm": 1.0461935997009277, + "kl": 0.69580078125, + "learning_rate": 7.790524769231968e-07, + "loss": 0.0812, + "reward": 1.2098214626312256, + "reward_std": 0.296480517834425, + "rewards/accuracy_reward": 0.258928582072258, + "rewards/format_reward": 0.9508928954601288, + "step": 1329 + }, + { + "completion_length": 959.0134429931641, + "epoch": 0.3972817564035546, + "grad_norm": 1.9589747190475464, + "kl": 0.71484375, + "learning_rate": 7.786483433075199e-07, + "loss": 0.0675, + "reward": 1.0245536267757416, + "reward_std": 0.25333914160728455, + "rewards/accuracy_reward": 0.08482143469154835, + "rewards/format_reward": 0.9397321790456772, + "step": 1330 + }, + { + "completion_length": 926.0491638183594, + "epoch": 0.3975804644910761, + "grad_norm": 0.7238629460334778, + "kl": 0.36962890625, + "learning_rate": 7.782439609448555e-07, + "loss": 0.0064, + "reward": 1.2053572237491608, + "reward_std": 0.14553620386868715, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9821428805589676, + "step": 1331 + }, + { + "completion_length": 936.8348541259766, + "epoch": 0.39787917257859756, + "grad_norm": 1.3971912860870361, + "kl": 0.5, + "learning_rate": 7.778393302751318e-07, + "loss": 0.1, + "reward": 1.1294643580913544, + "reward_std": 0.21799474582076073, + "rewards/accuracy_reward": 0.17857143469154835, + "rewards/format_reward": 0.95089291036129, + "step": 1332 + }, + { + "completion_length": 1054.997817993164, + "epoch": 0.39817788066611903, + "grad_norm": 1.6904025077819824, + "kl": 0.4775390625, + "learning_rate": 7.774344517385476e-07, + "loss": 0.1041, + "reward": 1.1183036267757416, + "reward_std": 0.2579118348658085, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.9620536118745804, + "step": 1333 + }, + { + "completion_length": 1025.9888763427734, + "epoch": 0.3984765887536405, + "grad_norm": 1.521178960800171, + "kl": 0.5322265625, + "learning_rate": 7.770293257755707e-07, + "loss": 0.0761, + "reward": 1.1651786267757416, + "reward_std": 0.2832630090415478, + "rewards/accuracy_reward": 0.216517873108387, + "rewards/format_reward": 0.9486607611179352, + "step": 1334 + }, + { + "completion_length": 1009.6942138671875, + "epoch": 0.398775296841162, + "grad_norm": 1.802173376083374, + "kl": 0.47119140625, + "learning_rate": 7.766239528269387e-07, + "loss": 0.0632, + "reward": 1.1674107909202576, + "reward_std": 0.22983519174158573, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9598214626312256, + "step": 1335 + }, + { + "completion_length": 1022.8973846435547, + "epoch": 0.39907400492868345, + "grad_norm": 7.020371437072754, + "kl": 0.99951171875, + "learning_rate": 7.762183333336576e-07, + "loss": 0.1134, + "reward": 1.1875000596046448, + "reward_std": 0.28141115605831146, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.941964328289032, + "step": 1336 + }, + { + "completion_length": 1062.6719207763672, + "epoch": 0.3993727130162049, + "grad_norm": 0.7418414950370789, + "kl": 0.55029296875, + "learning_rate": 7.758124677370014e-07, + "loss": 0.0741, + "reward": 1.158482164144516, + "reward_std": 0.22791395895183086, + "rewards/accuracy_reward": 0.2031250149011612, + "rewards/format_reward": 0.9553571939468384, + "step": 1337 + }, + { + "completion_length": 935.4777221679688, + "epoch": 0.3996714211037264, + "grad_norm": 0.9110551476478577, + "kl": 0.32568359375, + "learning_rate": 7.754063564785125e-07, + "loss": 0.052, + "reward": 1.2544643580913544, + "reward_std": 0.26486365869641304, + "rewards/accuracy_reward": 0.28125000931322575, + "rewards/format_reward": 0.9732143133878708, + "step": 1338 + }, + { + "completion_length": 1015.6920318603516, + "epoch": 0.39997012919124786, + "grad_norm": 1.4898358583450317, + "kl": 0.385986328125, + "learning_rate": 7.75e-07, + "loss": 0.0694, + "reward": 1.272321492433548, + "reward_std": 0.2682076543569565, + "rewards/accuracy_reward": 0.3058035895228386, + "rewards/format_reward": 0.96651791036129, + "step": 1339 + }, + { + "completion_length": 1054.138427734375, + "epoch": 0.40026883727876933, + "grad_norm": 0.6653615236282349, + "kl": 0.51220703125, + "learning_rate": 7.745933987435398e-07, + "loss": 0.0558, + "reward": 1.1875000298023224, + "reward_std": 0.20508483052253723, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9709821790456772, + "step": 1340 + }, + { + "completion_length": 1024.2813262939453, + "epoch": 0.4005675453662908, + "grad_norm": 1.1469025611877441, + "kl": 0.53076171875, + "learning_rate": 7.741865531514743e-07, + "loss": 0.0755, + "reward": 1.1316964626312256, + "reward_std": 0.2525608763098717, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.96651791036129, + "step": 1341 + }, + { + "completion_length": 1023.138427734375, + "epoch": 0.40086625345381227, + "grad_norm": 7.8976054191589355, + "kl": 0.72802734375, + "learning_rate": 7.737794636664116e-07, + "loss": 0.091, + "reward": 1.0758928656578064, + "reward_std": 0.21499651670455933, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.9531250298023224, + "step": 1342 + }, + { + "completion_length": 988.6495971679688, + "epoch": 0.40116496154133374, + "grad_norm": 0.8830287456512451, + "kl": 0.6357421875, + "learning_rate": 7.733721307312251e-07, + "loss": 0.0728, + "reward": 1.0312500447034836, + "reward_std": 0.24606383591890335, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.9508928954601288, + "step": 1343 + }, + { + "completion_length": 1053.5379943847656, + "epoch": 0.4014636696288552, + "grad_norm": 0.9025485515594482, + "kl": 0.576171875, + "learning_rate": 7.729645547890533e-07, + "loss": 0.0609, + "reward": 1.0625000596046448, + "reward_std": 0.2209244854748249, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.9620535969734192, + "step": 1344 + }, + { + "completion_length": 1043.5469360351562, + "epoch": 0.4017623777163767, + "grad_norm": 1.2420052289962769, + "kl": 0.5927734375, + "learning_rate": 7.725567362832986e-07, + "loss": 0.0513, + "reward": 1.0915178954601288, + "reward_std": 0.2807557284832001, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 1345 + }, + { + "completion_length": 965.622802734375, + "epoch": 0.40206108580389815, + "grad_norm": 1.59744131565094, + "kl": 0.7490234375, + "learning_rate": 7.721486756576279e-07, + "loss": 0.0715, + "reward": 1.0625000298023224, + "reward_std": 0.20584012940526009, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.9464286118745804, + "step": 1346 + }, + { + "completion_length": 1028.761215209961, + "epoch": 0.4023597938914196, + "grad_norm": 2.7720534801483154, + "kl": 0.60107421875, + "learning_rate": 7.71740373355971e-07, + "loss": 0.0937, + "reward": 1.0089286267757416, + "reward_std": 0.26576774194836617, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.9419643133878708, + "step": 1347 + }, + { + "completion_length": 985.7388916015625, + "epoch": 0.4026585019789411, + "grad_norm": 1.975714921951294, + "kl": 0.640625, + "learning_rate": 7.713318298225206e-07, + "loss": 0.0646, + "reward": 1.0513393431901932, + "reward_std": 0.22269097343087196, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.948660746216774, + "step": 1348 + }, + { + "completion_length": 980.0312957763672, + "epoch": 0.40295721006646257, + "grad_norm": 1.8778821229934692, + "kl": 0.6103515625, + "learning_rate": 7.709230455017323e-07, + "loss": 0.0924, + "reward": 1.2321429252624512, + "reward_std": 0.259918637573719, + "rewards/accuracy_reward": 0.26785715483129025, + "rewards/format_reward": 0.9642857611179352, + "step": 1349 + }, + { + "completion_length": 928.9442291259766, + "epoch": 0.40325591815398404, + "grad_norm": 1.0072427988052368, + "kl": 0.53369140625, + "learning_rate": 7.705140208383234e-07, + "loss": 0.0348, + "reward": 1.1562500298023224, + "reward_std": 0.15799783542752266, + "rewards/accuracy_reward": 0.18750000861473382, + "rewards/format_reward": 0.9687500447034836, + "step": 1350 + }, + { + "completion_length": 985.1495819091797, + "epoch": 0.4035546262415055, + "grad_norm": 5.136187553405762, + "kl": 0.8662109375, + "learning_rate": 7.701047562772725e-07, + "loss": 0.0527, + "reward": 1.0870535969734192, + "reward_std": 0.276871420443058, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9531250298023224, + "step": 1351 + }, + { + "completion_length": 962.6875457763672, + "epoch": 0.403853334329027, + "grad_norm": 4.30214262008667, + "kl": 0.65771484375, + "learning_rate": 7.696952522638192e-07, + "loss": 0.0508, + "reward": 1.0334821939468384, + "reward_std": 0.16016959864646196, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.9754464626312256, + "step": 1352 + }, + { + "completion_length": 955.1205749511719, + "epoch": 0.40415204241654845, + "grad_norm": 0.9326144456863403, + "kl": 0.56982421875, + "learning_rate": 7.692855092434639e-07, + "loss": 0.0725, + "reward": 1.1093750894069672, + "reward_std": 0.1870501060038805, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9799107611179352, + "step": 1353 + }, + { + "completion_length": 960.3192291259766, + "epoch": 0.4044507505040699, + "grad_norm": 3.5995805263519287, + "kl": 0.66015625, + "learning_rate": 7.68875527661967e-07, + "loss": 0.09, + "reward": 1.1718750298023224, + "reward_std": 0.25762686505913734, + "rewards/accuracy_reward": 0.23214287543669343, + "rewards/format_reward": 0.9397321939468384, + "step": 1354 + }, + { + "completion_length": 917.0625305175781, + "epoch": 0.4047494585915914, + "grad_norm": 0.9510197639465332, + "kl": 0.42041015625, + "learning_rate": 7.684653079653479e-07, + "loss": 0.0354, + "reward": 1.082589328289032, + "reward_std": 0.14126678742468357, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9776786118745804, + "step": 1355 + }, + { + "completion_length": 969.6250457763672, + "epoch": 0.40504816667911286, + "grad_norm": 0.7304730415344238, + "kl": 0.40576171875, + "learning_rate": 7.680548505998857e-07, + "loss": 0.0213, + "reward": 1.0424107611179352, + "reward_std": 0.19679393619298935, + "rewards/accuracy_reward": 0.07589285867288709, + "rewards/format_reward": 0.96651791036129, + "step": 1356 + }, + { + "completion_length": 916.5000305175781, + "epoch": 0.4053468747666343, + "grad_norm": 1.7145423889160156, + "kl": 0.3994140625, + "learning_rate": 7.676441560121178e-07, + "loss": 0.0571, + "reward": 1.1473214626312256, + "reward_std": 0.19643768295645714, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.979910746216774, + "step": 1357 + }, + { + "completion_length": 957.2120971679688, + "epoch": 0.40564558285415575, + "grad_norm": 1.7887991666793823, + "kl": 0.32373046875, + "learning_rate": 7.672332246488396e-07, + "loss": 0.079, + "reward": 1.1339286267757416, + "reward_std": 0.22911597602069378, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9665178954601288, + "step": 1358 + }, + { + "completion_length": 960.2812957763672, + "epoch": 0.4059442909416772, + "grad_norm": 1.0364445447921753, + "kl": 0.375, + "learning_rate": 7.66822056957104e-07, + "loss": 0.0483, + "reward": 1.2053572237491608, + "reward_std": 0.20579254999756813, + "rewards/accuracy_reward": 0.23660715483129025, + "rewards/format_reward": 0.9687500447034836, + "step": 1359 + }, + { + "completion_length": 1057.216567993164, + "epoch": 0.4062429990291987, + "grad_norm": 0.8335170149803162, + "kl": 0.448486328125, + "learning_rate": 7.664106533842214e-07, + "loss": 0.0386, + "reward": 1.129464328289032, + "reward_std": 0.19918310269713402, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.9665178805589676, + "step": 1360 + }, + { + "completion_length": 950.7143249511719, + "epoch": 0.40654170711672016, + "grad_norm": 3.901939868927002, + "kl": 0.47314453125, + "learning_rate": 7.659990143777584e-07, + "loss": 0.0966, + "reward": 1.2142857611179352, + "reward_std": 0.20603691413998604, + "rewards/accuracy_reward": 0.24107143841683865, + "rewards/format_reward": 0.9732143431901932, + "step": 1361 + }, + { + "completion_length": 1031.214370727539, + "epoch": 0.40684041520424163, + "grad_norm": 14.366890907287598, + "kl": 1.1396484375, + "learning_rate": 7.655871403855378e-07, + "loss": 0.1113, + "reward": 1.1607143431901932, + "reward_std": 0.24715937301516533, + "rewards/accuracy_reward": 0.20089286798611283, + "rewards/format_reward": 0.9598214775323868, + "step": 1362 + }, + { + "completion_length": 1016.0536041259766, + "epoch": 0.4071391232917631, + "grad_norm": 3.3815062046051025, + "kl": 0.6943359375, + "learning_rate": 7.651750318556384e-07, + "loss": 0.0044, + "reward": 1.1250000298023224, + "reward_std": 0.23987198248505592, + "rewards/accuracy_reward": 0.16964286798611283, + "rewards/format_reward": 0.9553571790456772, + "step": 1363 + }, + { + "completion_length": 973.0424499511719, + "epoch": 0.4074378313792846, + "grad_norm": 86.5916976928711, + "kl": 2.18359375, + "learning_rate": 7.647626892363938e-07, + "loss": 0.2171, + "reward": 1.1272322237491608, + "reward_std": 0.15900097787380219, + "rewards/accuracy_reward": 0.14508929196745157, + "rewards/format_reward": 0.9821428954601288, + "step": 1364 + }, + { + "completion_length": 986.2589874267578, + "epoch": 0.40773653946680605, + "grad_norm": 5.304872989654541, + "kl": 0.78662109375, + "learning_rate": 7.643501129763923e-07, + "loss": 0.0794, + "reward": 1.071428656578064, + "reward_std": 0.22852055728435516, + "rewards/accuracy_reward": 0.10714286426082253, + "rewards/format_reward": 0.964285746216774, + "step": 1365 + }, + { + "completion_length": 988.7210388183594, + "epoch": 0.4080352475543275, + "grad_norm": 0.8833049535751343, + "kl": 0.5380859375, + "learning_rate": 7.639373035244763e-07, + "loss": 0.0428, + "reward": 1.2276785969734192, + "reward_std": 0.26666712388396263, + "rewards/accuracy_reward": 0.26116072945296764, + "rewards/format_reward": 0.9665178954601288, + "step": 1366 + }, + { + "completion_length": 945.6674499511719, + "epoch": 0.408333955641849, + "grad_norm": 1.432510256767273, + "kl": 0.5107421875, + "learning_rate": 7.635242613297423e-07, + "loss": 0.0293, + "reward": 1.238839328289032, + "reward_std": 0.2042093388736248, + "rewards/accuracy_reward": 0.26562501350417733, + "rewards/format_reward": 0.9732143133878708, + "step": 1367 + }, + { + "completion_length": 1086.0982666015625, + "epoch": 0.40863266372937046, + "grad_norm": 6.041935443878174, + "kl": 0.63330078125, + "learning_rate": 7.631109868415397e-07, + "loss": 0.0555, + "reward": 1.0892857611179352, + "reward_std": 0.1755383051931858, + "rewards/accuracy_reward": 0.11160714831203222, + "rewards/format_reward": 0.9776786118745804, + "step": 1368 + }, + { + "completion_length": 1010.1741638183594, + "epoch": 0.40893137181689193, + "grad_norm": 1.4063668251037598, + "kl": 0.5, + "learning_rate": 7.626974805094704e-07, + "loss": 0.0723, + "reward": 1.1562500298023224, + "reward_std": 0.2556280419230461, + "rewards/accuracy_reward": 0.19419643841683865, + "rewards/format_reward": 0.9620536118745804, + "step": 1369 + }, + { + "completion_length": 1011.8393402099609, + "epoch": 0.4092300799044134, + "grad_norm": 1.3684601783752441, + "kl": 0.544921875, + "learning_rate": 7.62283742783389e-07, + "loss": 0.0518, + "reward": 1.1517857313156128, + "reward_std": 0.2837824635207653, + "rewards/accuracy_reward": 0.18973215529695153, + "rewards/format_reward": 0.9620536118745804, + "step": 1370 + }, + { + "completion_length": 1051.4085540771484, + "epoch": 0.4095287879919349, + "grad_norm": 0.9698778390884399, + "kl": 0.49609375, + "learning_rate": 7.618697741134012e-07, + "loss": 0.049, + "reward": 1.0267857611179352, + "reward_std": 0.18238049745559692, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.9709821790456772, + "step": 1371 + }, + { + "completion_length": 963.7857666015625, + "epoch": 0.40982749607945634, + "grad_norm": 0.7104634642601013, + "kl": 0.5087890625, + "learning_rate": 7.61455574949865e-07, + "loss": 0.0511, + "reward": 1.1093750298023224, + "reward_std": 0.19910847768187523, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9754464626312256, + "step": 1372 + }, + { + "completion_length": 1004.04248046875, + "epoch": 0.4101262041669778, + "grad_norm": 3.897771120071411, + "kl": 0.63232421875, + "learning_rate": 7.610411457433878e-07, + "loss": 0.0722, + "reward": 1.1183036267757416, + "reward_std": 0.19654606096446514, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.96651791036129, + "step": 1373 + }, + { + "completion_length": 980.0245971679688, + "epoch": 0.4104249122544993, + "grad_norm": 1.3757824897766113, + "kl": 0.75390625, + "learning_rate": 7.606264869448285e-07, + "loss": 0.0815, + "reward": 1.0982143431901932, + "reward_std": 0.21892092004418373, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.9687500298023224, + "step": 1374 + }, + { + "completion_length": 1095.4263916015625, + "epoch": 0.41072362034202076, + "grad_norm": 0.6989831328392029, + "kl": 0.52490234375, + "learning_rate": 7.602115990052952e-07, + "loss": 0.0598, + "reward": 1.0290178954601288, + "reward_std": 0.2183571755886078, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.9598214775323868, + "step": 1375 + }, + { + "completion_length": 975.5000457763672, + "epoch": 0.4110223284295422, + "grad_norm": 1.5230600833892822, + "kl": 0.6064453125, + "learning_rate": 7.59796482376145e-07, + "loss": 0.0438, + "reward": 1.194196492433548, + "reward_std": 0.23332232981920242, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9620536118745804, + "step": 1376 + }, + { + "completion_length": 886.2232666015625, + "epoch": 0.4113210365170637, + "grad_norm": 1.064803957939148, + "kl": 0.50634765625, + "learning_rate": 7.59381137508984e-07, + "loss": 0.0265, + "reward": 1.1897321939468384, + "reward_std": 0.18760051392018795, + "rewards/accuracy_reward": 0.21875000558793545, + "rewards/format_reward": 0.9709821790456772, + "step": 1377 + }, + { + "completion_length": 959.5402221679688, + "epoch": 0.41161974460458517, + "grad_norm": 1.997105360031128, + "kl": 0.5, + "learning_rate": 7.58965564855667e-07, + "loss": 0.0853, + "reward": 1.1830357611179352, + "reward_std": 0.1886446848511696, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.973214328289032, + "step": 1378 + }, + { + "completion_length": 994.1339569091797, + "epoch": 0.41191845269210664, + "grad_norm": 0.967383086681366, + "kl": 0.47705078125, + "learning_rate": 7.585497648682965e-07, + "loss": 0.0264, + "reward": 1.1852678954601288, + "reward_std": 0.20496477372944355, + "rewards/accuracy_reward": 0.19866072130389512, + "rewards/format_reward": 0.986607164144516, + "step": 1379 + }, + { + "completion_length": 982.263427734375, + "epoch": 0.4122171607796281, + "grad_norm": 1.115004062652588, + "kl": 0.42138671875, + "learning_rate": 7.581337379992218e-07, + "loss": 0.0515, + "reward": 1.0558036267757416, + "reward_std": 0.15935370698571205, + "rewards/accuracy_reward": 0.08705357438884676, + "rewards/format_reward": 0.9687500447034836, + "step": 1380 + }, + { + "completion_length": 919.3370971679688, + "epoch": 0.4125158688671496, + "grad_norm": 0.888556957244873, + "kl": 0.3984375, + "learning_rate": 7.577174847010394e-07, + "loss": 0.0314, + "reward": 1.2142857611179352, + "reward_std": 0.24147436395287514, + "rewards/accuracy_reward": 0.2433035857975483, + "rewards/format_reward": 0.9709822088479996, + "step": 1381 + }, + { + "completion_length": 1083.4308319091797, + "epoch": 0.41281457695467105, + "grad_norm": 2.944892644882202, + "kl": 0.54150390625, + "learning_rate": 7.573010054265921e-07, + "loss": 0.0755, + "reward": 1.1250000298023224, + "reward_std": 0.2413301169872284, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9598214775323868, + "step": 1382 + }, + { + "completion_length": 981.5558471679688, + "epoch": 0.4131132850421925, + "grad_norm": 2.07818603515625, + "kl": 0.580078125, + "learning_rate": 7.568843006289687e-07, + "loss": 0.0611, + "reward": 1.078125074505806, + "reward_std": 0.23362846858799458, + "rewards/accuracy_reward": 0.12723214644938707, + "rewards/format_reward": 0.9508928954601288, + "step": 1383 + }, + { + "completion_length": 896.3995971679688, + "epoch": 0.413411993129714, + "grad_norm": 1.3593229055404663, + "kl": 0.4814453125, + "learning_rate": 7.564673707615029e-07, + "loss": 0.069, + "reward": 1.1584822237491608, + "reward_std": 0.20681694522500038, + "rewards/accuracy_reward": 0.18973215040750802, + "rewards/format_reward": 0.9687500596046448, + "step": 1384 + }, + { + "completion_length": 996.0000610351562, + "epoch": 0.41371070121723547, + "grad_norm": 1.27206289768219, + "kl": 0.5390625, + "learning_rate": 7.560502162777739e-07, + "loss": 0.0842, + "reward": 1.0825893133878708, + "reward_std": 0.24061980098485947, + "rewards/accuracy_reward": 0.12276786682195961, + "rewards/format_reward": 0.9598214626312256, + "step": 1385 + }, + { + "completion_length": 857.0781555175781, + "epoch": 0.41400940930475694, + "grad_norm": 0.3729884922504425, + "kl": 0.3004150390625, + "learning_rate": 7.556328376316046e-07, + "loss": 0.0104, + "reward": 1.1674107611179352, + "reward_std": 0.14184754714369774, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9910714477300644, + "step": 1386 + }, + { + "completion_length": 952.8080902099609, + "epoch": 0.4143081173922784, + "grad_norm": 0.40186384320259094, + "kl": 0.41796875, + "learning_rate": 7.552152352770622e-07, + "loss": 0.0343, + "reward": 1.0468750596046448, + "reward_std": 0.12473158072680235, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.9776786267757416, + "step": 1387 + }, + { + "completion_length": 897.6652221679688, + "epoch": 0.4146068254797999, + "grad_norm": 1.979200839996338, + "kl": 0.38232421875, + "learning_rate": 7.547974096684569e-07, + "loss": 0.0752, + "reward": 1.1250000447034836, + "reward_std": 0.20196041092276573, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9754464626312256, + "step": 1388 + }, + { + "completion_length": 869.8549652099609, + "epoch": 0.41490553356732135, + "grad_norm": 0.8122649788856506, + "kl": 0.37158203125, + "learning_rate": 7.543793612603423e-07, + "loss": 0.061, + "reward": 1.0870536267757416, + "reward_std": 0.21457655914127827, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9732143431901932, + "step": 1389 + }, + { + "completion_length": 964.6920166015625, + "epoch": 0.4152042416548428, + "grad_norm": 3.068053722381592, + "kl": 0.4423828125, + "learning_rate": 7.53961090507514e-07, + "loss": 0.0844, + "reward": 1.100446492433548, + "reward_std": 0.18852029368281364, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.96651791036129, + "step": 1390 + }, + { + "completion_length": 843.2545013427734, + "epoch": 0.4155029497423643, + "grad_norm": 1.3647485971450806, + "kl": 0.42529296875, + "learning_rate": 7.535425978650095e-07, + "loss": 0.0265, + "reward": 1.0937500298023224, + "reward_std": 0.2029933873564005, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.973214328289032, + "step": 1391 + }, + { + "completion_length": 922.6317443847656, + "epoch": 0.41580165782988576, + "grad_norm": 2.234715700149536, + "kl": 0.677734375, + "learning_rate": 7.531238837881079e-07, + "loss": 0.0744, + "reward": 1.1049107313156128, + "reward_std": 0.2662559971213341, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9598214775323868, + "step": 1392 + }, + { + "completion_length": 1018.5670318603516, + "epoch": 0.41610036591740723, + "grad_norm": 1.8234190940856934, + "kl": 0.80078125, + "learning_rate": 7.52704948732329e-07, + "loss": 0.0802, + "reward": 1.1183036267757416, + "reward_std": 0.21084315329790115, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9464286267757416, + "step": 1393 + }, + { + "completion_length": 953.1607666015625, + "epoch": 0.4163990740049287, + "grad_norm": 2.015012741088867, + "kl": 0.66943359375, + "learning_rate": 7.522857931534331e-07, + "loss": 0.0577, + "reward": 1.100446492433548, + "reward_std": 0.26019976660609245, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.9553571790456772, + "step": 1394 + }, + { + "completion_length": 922.8437805175781, + "epoch": 0.4166977820924502, + "grad_norm": 10.261689186096191, + "kl": 0.75244140625, + "learning_rate": 7.518664175074202e-07, + "loss": 0.0829, + "reward": 1.0446428954601288, + "reward_std": 0.19923672080039978, + "rewards/accuracy_reward": 0.08482142887078226, + "rewards/format_reward": 0.9598214775323868, + "step": 1395 + }, + { + "completion_length": 1045.0558471679688, + "epoch": 0.41699649017997165, + "grad_norm": 5.3390045166015625, + "kl": 0.8466796875, + "learning_rate": 7.514468222505303e-07, + "loss": 0.1051, + "reward": 1.1138393431901932, + "reward_std": 0.2679128684103489, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.9531250447034836, + "step": 1396 + }, + { + "completion_length": 967.1518249511719, + "epoch": 0.4172951982674931, + "grad_norm": 2.8612332344055176, + "kl": 0.546875, + "learning_rate": 7.510270078392417e-07, + "loss": 0.1089, + "reward": 1.0937500298023224, + "reward_std": 0.23323950171470642, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9553571939468384, + "step": 1397 + }, + { + "completion_length": 916.872802734375, + "epoch": 0.4175939063550146, + "grad_norm": 2.4834952354431152, + "kl": 0.6962890625, + "learning_rate": 7.506069747302712e-07, + "loss": 0.113, + "reward": 0.988839328289032, + "reward_std": 0.21918479353189468, + "rewards/accuracy_reward": 0.04017857415601611, + "rewards/format_reward": 0.948660746216774, + "step": 1398 + }, + { + "completion_length": 944.9889068603516, + "epoch": 0.41789261444253606, + "grad_norm": 1.3055312633514404, + "kl": 0.49365234375, + "learning_rate": 7.501867233805739e-07, + "loss": 0.0964, + "reward": 1.1004464626312256, + "reward_std": 0.21960639581084251, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9575893431901932, + "step": 1399 + }, + { + "completion_length": 869.7277221679688, + "epoch": 0.4181913225300575, + "grad_norm": 1.2685075998306274, + "kl": 0.57080078125, + "learning_rate": 7.49766254247342e-07, + "loss": 0.0734, + "reward": 1.111607164144516, + "reward_std": 0.21309761330485344, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9598214626312256, + "step": 1400 + }, + { + "completion_length": 978.5335388183594, + "epoch": 0.41849003061757895, + "grad_norm": 1.6687427759170532, + "kl": 0.70654296875, + "learning_rate": 7.493455677880046e-07, + "loss": 0.089, + "reward": 1.0290178954601288, + "reward_std": 0.2654389292001724, + "rewards/accuracy_reward": 0.08705357369035482, + "rewards/format_reward": 0.9419643431901932, + "step": 1401 + }, + { + "completion_length": 933.7433624267578, + "epoch": 0.4187887387051004, + "grad_norm": 2.3318426609039307, + "kl": 0.92578125, + "learning_rate": 7.489246644602273e-07, + "loss": 0.1011, + "reward": 1.07589291036129, + "reward_std": 0.2861098814755678, + "rewards/accuracy_reward": 0.13616072433069348, + "rewards/format_reward": 0.9397321790456772, + "step": 1402 + }, + { + "completion_length": 945.4732666015625, + "epoch": 0.4190874467926219, + "grad_norm": 3.8201730251312256, + "kl": 0.9677734375, + "learning_rate": 7.485035447219122e-07, + "loss": 0.1109, + "reward": 1.0669643431901932, + "reward_std": 0.24120841920375824, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9486607611179352, + "step": 1403 + }, + { + "completion_length": 893.6205749511719, + "epoch": 0.41938615488014336, + "grad_norm": 116.37393951416016, + "kl": 2.2744140625, + "learning_rate": 7.480822090311955e-07, + "loss": 0.227, + "reward": 1.1428572237491608, + "reward_std": 0.35109538584947586, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9464286118745804, + "step": 1404 + }, + { + "completion_length": 917.779052734375, + "epoch": 0.41968486296766483, + "grad_norm": 2.908099412918091, + "kl": 1.0185546875, + "learning_rate": 7.476606578464496e-07, + "loss": 0.1208, + "reward": 1.207589328289032, + "reward_std": 0.3176935985684395, + "rewards/accuracy_reward": 0.2566964440047741, + "rewards/format_reward": 0.9508928954601288, + "step": 1405 + }, + { + "completion_length": 944.2299499511719, + "epoch": 0.4199835710551863, + "grad_norm": 1.817046046257019, + "kl": 0.8271484375, + "learning_rate": 7.472388916262809e-07, + "loss": 0.0889, + "reward": 1.1339286267757416, + "reward_std": 0.295966487377882, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9375000447034836, + "step": 1406 + }, + { + "completion_length": 920.1719207763672, + "epoch": 0.42028227914270777, + "grad_norm": 1.9928267002105713, + "kl": 0.9716796875, + "learning_rate": 7.468169108295295e-07, + "loss": 0.1201, + "reward": 1.1428572237491608, + "reward_std": 0.227925356477499, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.9598214775323868, + "step": 1407 + }, + { + "completion_length": 911.3772735595703, + "epoch": 0.42058098723022924, + "grad_norm": 2.0958492755889893, + "kl": 1.2265625, + "learning_rate": 7.463947159152692e-07, + "loss": 0.1481, + "reward": 1.051339328289032, + "reward_std": 0.2701086886227131, + "rewards/accuracy_reward": 0.11830357927829027, + "rewards/format_reward": 0.9330357611179352, + "step": 1408 + }, + { + "completion_length": 934.1719207763672, + "epoch": 0.4208796953177507, + "grad_norm": 6.034945964813232, + "kl": 1.134765625, + "learning_rate": 7.459723073428067e-07, + "loss": 0.1586, + "reward": 1.1227678954601288, + "reward_std": 0.29551317542791367, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.9441964626312256, + "step": 1409 + }, + { + "completion_length": 928.2634429931641, + "epoch": 0.4211784034052722, + "grad_norm": 1.4162214994430542, + "kl": 0.8896484375, + "learning_rate": 7.455496855716813e-07, + "loss": 0.0975, + "reward": 1.0848214775323868, + "reward_std": 0.2821077071130276, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.941964328289032, + "step": 1410 + }, + { + "completion_length": 889.9687957763672, + "epoch": 0.42147711149279365, + "grad_norm": 2.28924560546875, + "kl": 1.0166015625, + "learning_rate": 7.45126851061664e-07, + "loss": 0.0859, + "reward": 1.0892857760190964, + "reward_std": 0.2801487147808075, + "rewards/accuracy_reward": 0.1473214311990887, + "rewards/format_reward": 0.9419643133878708, + "step": 1411 + }, + { + "completion_length": 968.3683319091797, + "epoch": 0.4217758195803151, + "grad_norm": 1.612638235092163, + "kl": 1.0009765625, + "learning_rate": 7.447038042727571e-07, + "loss": 0.0854, + "reward": 1.1383928954601288, + "reward_std": 0.30752529948949814, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9352678954601288, + "step": 1412 + }, + { + "completion_length": 966.4397583007812, + "epoch": 0.4220745276678366, + "grad_norm": 0.8966812491416931, + "kl": 0.59765625, + "learning_rate": 7.442805456651941e-07, + "loss": 0.0454, + "reward": 1.1607143580913544, + "reward_std": 0.21526320278644562, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.9754464775323868, + "step": 1413 + }, + { + "completion_length": 946.1049499511719, + "epoch": 0.42237323575535807, + "grad_norm": 0.9223674535751343, + "kl": 0.7060546875, + "learning_rate": 7.438570756994391e-07, + "loss": 0.0624, + "reward": 1.0066964477300644, + "reward_std": 0.2767515294253826, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.9308036118745804, + "step": 1414 + }, + { + "completion_length": 958.404052734375, + "epoch": 0.42267194384287954, + "grad_norm": 154.04747009277344, + "kl": 2.8525390625, + "learning_rate": 7.434333948361857e-07, + "loss": 0.1835, + "reward": 1.0758929252624512, + "reward_std": 0.2577466554939747, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.9598214626312256, + "step": 1415 + }, + { + "completion_length": 958.4799652099609, + "epoch": 0.422970651930401, + "grad_norm": 0.9951699376106262, + "kl": 0.6591796875, + "learning_rate": 7.430095035363572e-07, + "loss": 0.0383, + "reward": 1.1116071790456772, + "reward_std": 0.14958296157419682, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.964285746216774, + "step": 1416 + }, + { + "completion_length": 943.0268402099609, + "epoch": 0.4232693600179225, + "grad_norm": 1.0060070753097534, + "kl": 0.54150390625, + "learning_rate": 7.425854022611059e-07, + "loss": 0.0599, + "reward": 1.149553656578064, + "reward_std": 0.25888729095458984, + "rewards/accuracy_reward": 0.20758929569274187, + "rewards/format_reward": 0.941964328289032, + "step": 1417 + }, + { + "completion_length": 869.810302734375, + "epoch": 0.42356806810544395, + "grad_norm": 1.6287455558776855, + "kl": 0.51318359375, + "learning_rate": 7.421610914718122e-07, + "loss": 0.0775, + "reward": 0.9933036118745804, + "reward_std": 0.18817430920898914, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.9620536118745804, + "step": 1418 + }, + { + "completion_length": 874.935302734375, + "epoch": 0.4238667761929654, + "grad_norm": 3.3262219429016113, + "kl": 0.6259765625, + "learning_rate": 7.41736571630085e-07, + "loss": 0.0717, + "reward": 1.1339286416769028, + "reward_std": 0.307668074965477, + "rewards/accuracy_reward": 0.18973215157166123, + "rewards/format_reward": 0.9441964775323868, + "step": 1419 + }, + { + "completion_length": 940.0491333007812, + "epoch": 0.4241654842804869, + "grad_norm": 1.2162278890609741, + "kl": 0.5712890625, + "learning_rate": 7.413118431977598e-07, + "loss": 0.0582, + "reward": 1.1049107611179352, + "reward_std": 0.18284714221954346, + "rewards/accuracy_reward": 0.12500000651925802, + "rewards/format_reward": 0.9799107611179352, + "step": 1420 + }, + { + "completion_length": 831.3549346923828, + "epoch": 0.42446419236800836, + "grad_norm": 1.6418143510818481, + "kl": 0.5439453125, + "learning_rate": 7.408869066369e-07, + "loss": 0.082, + "reward": 1.1026786267757416, + "reward_std": 0.23846041411161423, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.96651791036129, + "step": 1421 + }, + { + "completion_length": 883.763427734375, + "epoch": 0.42476290045552983, + "grad_norm": 1.0898375511169434, + "kl": 0.404296875, + "learning_rate": 7.404617624097948e-07, + "loss": 0.0848, + "reward": 1.2165178954601288, + "reward_std": 0.2845405079424381, + "rewards/accuracy_reward": 0.254464291036129, + "rewards/format_reward": 0.9620536267757416, + "step": 1422 + }, + { + "completion_length": 893.0312805175781, + "epoch": 0.4250616085430513, + "grad_norm": 1.4178918600082397, + "kl": 0.50244140625, + "learning_rate": 7.400364109789591e-07, + "loss": 0.1095, + "reward": 1.0915179252624512, + "reward_std": 0.222233384847641, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9598214626312256, + "step": 1423 + }, + { + "completion_length": 859.5647583007812, + "epoch": 0.4253603166305728, + "grad_norm": 0.8913455605506897, + "kl": 0.50439453125, + "learning_rate": 7.396108528071339e-07, + "loss": 0.0115, + "reward": 1.1026785969734192, + "reward_std": 0.15652829594910145, + "rewards/accuracy_reward": 0.13392857951112092, + "rewards/format_reward": 0.9687500447034836, + "step": 1424 + }, + { + "completion_length": 849.5669860839844, + "epoch": 0.42565902471809425, + "grad_norm": 0.8622751832008362, + "kl": 0.484375, + "learning_rate": 7.391850883572849e-07, + "loss": 0.096, + "reward": 1.1584821939468384, + "reward_std": 0.19964439049363136, + "rewards/accuracy_reward": 0.19196430034935474, + "rewards/format_reward": 0.96651791036129, + "step": 1425 + }, + { + "completion_length": 853.7567291259766, + "epoch": 0.4259577328056157, + "grad_norm": 1.066535472869873, + "kl": 0.50341796875, + "learning_rate": 7.387591180926015e-07, + "loss": 0.098, + "reward": 1.1049107611179352, + "reward_std": 0.25611234083771706, + "rewards/accuracy_reward": 0.14508928917348385, + "rewards/format_reward": 0.9598214626312256, + "step": 1426 + }, + { + "completion_length": 826.5201263427734, + "epoch": 0.4262564408931372, + "grad_norm": 2.9296092987060547, + "kl": 0.509765625, + "learning_rate": 7.383329424764982e-07, + "loss": 0.1048, + "reward": 1.2187500596046448, + "reward_std": 0.259705375880003, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9687500149011612, + "step": 1427 + }, + { + "completion_length": 894.8594207763672, + "epoch": 0.42655514898065866, + "grad_norm": 1.1288480758666992, + "kl": 0.6220703125, + "learning_rate": 7.379065619726123e-07, + "loss": 0.0906, + "reward": 1.1316964626312256, + "reward_std": 0.34485872834920883, + "rewards/accuracy_reward": 0.2053571566939354, + "rewards/format_reward": 0.926339328289032, + "step": 1428 + }, + { + "completion_length": 890.6094207763672, + "epoch": 0.42685385706818013, + "grad_norm": 1.1106340885162354, + "kl": 0.6728515625, + "learning_rate": 7.374799770448036e-07, + "loss": 0.127, + "reward": 1.1651786267757416, + "reward_std": 0.3150978311896324, + "rewards/accuracy_reward": 0.22767857555299997, + "rewards/format_reward": 0.9375000447034836, + "step": 1429 + }, + { + "completion_length": 834.997802734375, + "epoch": 0.4271525651557016, + "grad_norm": 1.1909501552581787, + "kl": 0.556640625, + "learning_rate": 7.370531881571548e-07, + "loss": 0.1155, + "reward": 1.0736607313156128, + "reward_std": 0.2705194056034088, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9531250447034836, + "step": 1430 + }, + { + "completion_length": 948.4308624267578, + "epoch": 0.4274512732432231, + "grad_norm": 1.500737190246582, + "kl": 0.6337890625, + "learning_rate": 7.366261957739705e-07, + "loss": 0.1657, + "reward": 1.0736607760190964, + "reward_std": 0.2890625521540642, + "rewards/accuracy_reward": 0.14062500977888703, + "rewards/format_reward": 0.9330357611179352, + "step": 1431 + }, + { + "completion_length": 822.3482513427734, + "epoch": 0.42774998133074454, + "grad_norm": 1.534435510635376, + "kl": 0.671875, + "learning_rate": 7.361990003597767e-07, + "loss": 0.1322, + "reward": 1.1160714775323868, + "reward_std": 0.26588382199406624, + "rewards/accuracy_reward": 0.16071429220028222, + "rewards/format_reward": 0.9553571939468384, + "step": 1432 + }, + { + "completion_length": 871.1629943847656, + "epoch": 0.428048689418266, + "grad_norm": 1.459674596786499, + "kl": 1.10546875, + "learning_rate": 7.357716023793199e-07, + "loss": 0.1523, + "reward": 1.098214328289032, + "reward_std": 0.2557775266468525, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.933035746216774, + "step": 1433 + }, + { + "completion_length": 878.4219055175781, + "epoch": 0.4283473975057875, + "grad_norm": 1.0500961542129517, + "kl": 1.0263671875, + "learning_rate": 7.353440022975674e-07, + "loss": 0.1607, + "reward": 1.0468750596046448, + "reward_std": 0.2801683284342289, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.9308036118745804, + "step": 1434 + }, + { + "completion_length": 841.6339874267578, + "epoch": 0.42864610559330896, + "grad_norm": 1.564186692237854, + "kl": 0.9931640625, + "learning_rate": 7.349162005797058e-07, + "loss": 0.1008, + "reward": 1.0892857909202576, + "reward_std": 0.21757392585277557, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9419643431901932, + "step": 1435 + }, + { + "completion_length": 793.7902069091797, + "epoch": 0.42894481368083043, + "grad_norm": 1.3996422290802002, + "kl": 0.99609375, + "learning_rate": 7.344881976911419e-07, + "loss": 0.1277, + "reward": 1.113839328289032, + "reward_std": 0.21520761400461197, + "rewards/accuracy_reward": 0.15401786379516125, + "rewards/format_reward": 0.9598214775323868, + "step": 1436 + }, + { + "completion_length": 897.4911193847656, + "epoch": 0.4292435217683519, + "grad_norm": 1.3917301893234253, + "kl": 0.9033203125, + "learning_rate": 7.340599940975005e-07, + "loss": 0.0948, + "reward": 1.0714285969734192, + "reward_std": 0.21477236971259117, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.9508928954601288, + "step": 1437 + }, + { + "completion_length": 849.1250305175781, + "epoch": 0.42954222985587337, + "grad_norm": 2.1561639308929443, + "kl": 0.9736328125, + "learning_rate": 7.336315902646255e-07, + "loss": 0.1894, + "reward": 1.0736607611179352, + "reward_std": 0.34804149717092514, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.9308036118745804, + "step": 1438 + }, + { + "completion_length": 782.7098693847656, + "epoch": 0.42984093794339484, + "grad_norm": 2.04347825050354, + "kl": 0.794921875, + "learning_rate": 7.332029866585781e-07, + "loss": 0.131, + "reward": 1.1406250298023224, + "reward_std": 0.2228389009833336, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9531250298023224, + "step": 1439 + }, + { + "completion_length": 773.8125305175781, + "epoch": 0.4301396460309163, + "grad_norm": 1.07760751247406, + "kl": 0.76953125, + "learning_rate": 7.32774183745637e-07, + "loss": 0.1092, + "reward": 1.0915179252624512, + "reward_std": 0.22711247578263283, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.9642857611179352, + "step": 1440 + }, + { + "completion_length": 793.4888763427734, + "epoch": 0.4304383541184378, + "grad_norm": 1.6022993326187134, + "kl": 0.8291015625, + "learning_rate": 7.323451819922979e-07, + "loss": 0.1576, + "reward": 1.073660746216774, + "reward_std": 0.2123720459640026, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.9531250298023224, + "step": 1441 + }, + { + "completion_length": 897.2500610351562, + "epoch": 0.43073706220595925, + "grad_norm": 1.956619381904602, + "kl": 0.9482421875, + "learning_rate": 7.319159818652725e-07, + "loss": 0.1347, + "reward": 1.1093750596046448, + "reward_std": 0.2773032858967781, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.948660746216774, + "step": 1442 + }, + { + "completion_length": 910.4397583007812, + "epoch": 0.43103577029348067, + "grad_norm": 2.5708229541778564, + "kl": 1.087890625, + "learning_rate": 7.314865838314885e-07, + "loss": 0.1561, + "reward": 1.0022321790456772, + "reward_std": 0.25464750826358795, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.9375000298023224, + "step": 1443 + }, + { + "completion_length": 854.0625457763672, + "epoch": 0.43133447838100214, + "grad_norm": 5.857600212097168, + "kl": 1.42578125, + "learning_rate": 7.310569883580887e-07, + "loss": 0.3068, + "reward": 0.9441964775323868, + "reward_std": 0.3358064517378807, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.8883928954601288, + "step": 1444 + }, + { + "completion_length": 884.6004943847656, + "epoch": 0.4316331864685236, + "grad_norm": 1.9274861812591553, + "kl": 0.87890625, + "learning_rate": 7.306271959124313e-07, + "loss": 0.1019, + "reward": 1.0446428954601288, + "reward_std": 0.2430613748729229, + "rewards/accuracy_reward": 0.08928572107106447, + "rewards/format_reward": 0.9553571790456772, + "step": 1445 + }, + { + "completion_length": 833.8504791259766, + "epoch": 0.4319318945560451, + "grad_norm": 1.0534968376159668, + "kl": 0.67236328125, + "learning_rate": 7.301972069620881e-07, + "loss": 0.0744, + "reward": 1.111607164144516, + "reward_std": 0.29177702963352203, + "rewards/accuracy_reward": 0.17633929941803217, + "rewards/format_reward": 0.9352678954601288, + "step": 1446 + }, + { + "completion_length": 927.7455749511719, + "epoch": 0.43223060264356655, + "grad_norm": 1.7433277368545532, + "kl": 0.615234375, + "learning_rate": 7.297670219748447e-07, + "loss": 0.0958, + "reward": 1.102678656578064, + "reward_std": 0.27389923855662346, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.9441964626312256, + "step": 1447 + }, + { + "completion_length": 949.1049499511719, + "epoch": 0.432529310731088, + "grad_norm": 0.888603687286377, + "kl": 0.68896484375, + "learning_rate": 7.293366414187008e-07, + "loss": 0.0613, + "reward": 1.0401785969734192, + "reward_std": 0.2823113948106766, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9263393133878708, + "step": 1448 + }, + { + "completion_length": 878.5536193847656, + "epoch": 0.4328280188186095, + "grad_norm": 1.0527973175048828, + "kl": 0.5751953125, + "learning_rate": 7.289060657618677e-07, + "loss": 0.0512, + "reward": 1.1339286267757416, + "reward_std": 0.24450015835464, + "rewards/accuracy_reward": 0.18526786752045155, + "rewards/format_reward": 0.948660746216774, + "step": 1449 + }, + { + "completion_length": 915.1495971679688, + "epoch": 0.43312672690613097, + "grad_norm": 1.2409911155700684, + "kl": 0.65625, + "learning_rate": 7.284752954727698e-07, + "loss": 0.1087, + "reward": 1.1339285969734192, + "reward_std": 0.1955862157046795, + "rewards/accuracy_reward": 0.18303572502918541, + "rewards/format_reward": 0.95089291036129, + "step": 1450 + }, + { + "completion_length": 949.1295013427734, + "epoch": 0.43342543499365244, + "grad_norm": 1.1416428089141846, + "kl": 0.578125, + "learning_rate": 7.280443310200429e-07, + "loss": 0.1177, + "reward": 1.1495536267757416, + "reward_std": 0.2369028590619564, + "rewards/accuracy_reward": 0.19642857694998384, + "rewards/format_reward": 0.9531250298023224, + "step": 1451 + }, + { + "completion_length": 931.9219360351562, + "epoch": 0.4337241430811739, + "grad_norm": 1.8793878555297852, + "kl": 0.52734375, + "learning_rate": 7.27613172872534e-07, + "loss": 0.0983, + "reward": 1.2053572088479996, + "reward_std": 0.25827064737677574, + "rewards/accuracy_reward": 0.2611607275903225, + "rewards/format_reward": 0.9441964626312256, + "step": 1452 + }, + { + "completion_length": 1039.7835388183594, + "epoch": 0.4340228511686954, + "grad_norm": 1.1271302700042725, + "kl": 0.615234375, + "learning_rate": 7.271818214993011e-07, + "loss": 0.0861, + "reward": 1.0022321790456772, + "reward_std": 0.25176773592829704, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.941964328289032, + "step": 1453 + }, + { + "completion_length": 952.2053985595703, + "epoch": 0.43432155925621685, + "grad_norm": 1.0777993202209473, + "kl": 0.64794921875, + "learning_rate": 7.267502773696118e-07, + "loss": 0.0961, + "reward": 1.0803571939468384, + "reward_std": 0.1990894377231598, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9620536118745804, + "step": 1454 + }, + { + "completion_length": 956.0312805175781, + "epoch": 0.4346202673437383, + "grad_norm": 0.9323623180389404, + "kl": 0.62646484375, + "learning_rate": 7.263185409529444e-07, + "loss": 0.1103, + "reward": 1.0647321939468384, + "reward_std": 0.2440718337893486, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.944196492433548, + "step": 1455 + }, + { + "completion_length": 995.3839874267578, + "epoch": 0.4349189754312598, + "grad_norm": 2.1293277740478516, + "kl": 0.7822265625, + "learning_rate": 7.258866127189854e-07, + "loss": 0.1354, + "reward": 1.0647321939468384, + "reward_std": 0.31189046800136566, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.93526791036129, + "step": 1456 + }, + { + "completion_length": 983.8906707763672, + "epoch": 0.43521768351878126, + "grad_norm": 1.474729061126709, + "kl": 0.7802734375, + "learning_rate": 7.254544931376305e-07, + "loss": 0.135, + "reward": 1.0714286118745804, + "reward_std": 0.31664759665727615, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9241071939468384, + "step": 1457 + }, + { + "completion_length": 953.27685546875, + "epoch": 0.43551639160630273, + "grad_norm": 1.8329614400863647, + "kl": 0.623046875, + "learning_rate": 7.250221826789836e-07, + "loss": 0.106, + "reward": 1.2120536267757416, + "reward_std": 0.25785211101174355, + "rewards/accuracy_reward": 0.258928582072258, + "rewards/format_reward": 0.9531250447034836, + "step": 1458 + }, + { + "completion_length": 970.6451263427734, + "epoch": 0.4358150996938242, + "grad_norm": 1.9387664794921875, + "kl": 0.6904296875, + "learning_rate": 7.245896818133558e-07, + "loss": 0.0859, + "reward": 1.1473214775323868, + "reward_std": 0.3249499276280403, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.9330357611179352, + "step": 1459 + }, + { + "completion_length": 993.1339721679688, + "epoch": 0.4361138077813457, + "grad_norm": 2.015058755874634, + "kl": 0.6025390625, + "learning_rate": 7.241569910112658e-07, + "loss": 0.0601, + "reward": 1.055803656578064, + "reward_std": 0.30416467040777206, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.9508928954601288, + "step": 1460 + }, + { + "completion_length": 1051.1652069091797, + "epoch": 0.43641251586886715, + "grad_norm": 1.6084243059158325, + "kl": 0.59912109375, + "learning_rate": 7.237241107434389e-07, + "loss": 0.0608, + "reward": 1.0602678954601288, + "reward_std": 0.18388518132269382, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.9665178954601288, + "step": 1461 + }, + { + "completion_length": 1076.3661346435547, + "epoch": 0.4367112239563886, + "grad_norm": 1.7002451419830322, + "kl": 0.54638671875, + "learning_rate": 7.232910414808063e-07, + "loss": 0.0444, + "reward": 1.0468750298023224, + "reward_std": 0.24513672292232513, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.9575893431901932, + "step": 1462 + }, + { + "completion_length": 1019.2768402099609, + "epoch": 0.4370099320439101, + "grad_norm": 1.3582249879837036, + "kl": 0.372314453125, + "learning_rate": 7.228577836945049e-07, + "loss": 0.0271, + "reward": 1.080357164144516, + "reward_std": 0.2287034224718809, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.96651791036129, + "step": 1463 + }, + { + "completion_length": 1084.8861999511719, + "epoch": 0.43730864013143156, + "grad_norm": 1.4753285646438599, + "kl": 0.38671875, + "learning_rate": 7.224243378558768e-07, + "loss": 0.0173, + "reward": 1.0357143431901932, + "reward_std": 0.17289770767092705, + "rewards/accuracy_reward": 0.06919643259607255, + "rewards/format_reward": 0.9665178805589676, + "step": 1464 + }, + { + "completion_length": 977.3594055175781, + "epoch": 0.43760734821895303, + "grad_norm": 2.692168951034546, + "kl": 0.319580078125, + "learning_rate": 7.219907044364682e-07, + "loss": 0.0469, + "reward": 1.0915178954601288, + "reward_std": 0.20993677899241447, + "rewards/accuracy_reward": 0.12500000838190317, + "rewards/format_reward": 0.9665178805589676, + "step": 1465 + }, + { + "completion_length": 1085.2947082519531, + "epoch": 0.4379060563064745, + "grad_norm": 1.2448512315750122, + "kl": 0.388671875, + "learning_rate": 7.215568839080304e-07, + "loss": -0.0047, + "reward": 1.0044643431901932, + "reward_std": 0.23089057207107544, + "rewards/accuracy_reward": 0.06696428754366934, + "rewards/format_reward": 0.9375000596046448, + "step": 1466 + }, + { + "completion_length": 964.591552734375, + "epoch": 0.43820476439399597, + "grad_norm": 1.0740128755569458, + "kl": 0.5673828125, + "learning_rate": 7.211228767425172e-07, + "loss": -0.0092, + "reward": 1.0848214626312256, + "reward_std": 0.22774088382720947, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9553571939468384, + "step": 1467 + }, + { + "completion_length": 966.7277221679688, + "epoch": 0.43850347248151744, + "grad_norm": 0.7673119902610779, + "kl": 0.41552734375, + "learning_rate": 7.20688683412086e-07, + "loss": 0.0041, + "reward": 1.1495536267757416, + "reward_std": 0.25888172909617424, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.957589328289032, + "step": 1468 + }, + { + "completion_length": 1086.1027374267578, + "epoch": 0.4388021805690389, + "grad_norm": 0.934783935546875, + "kl": 0.48095703125, + "learning_rate": 7.202543043890964e-07, + "loss": 0.0083, + "reward": 0.9732143431901932, + "reward_std": 0.2457234226167202, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.926339328289032, + "step": 1469 + }, + { + "completion_length": 931.0937957763672, + "epoch": 0.4391008886565604, + "grad_norm": 1.2891063690185547, + "kl": 0.5, + "learning_rate": 7.198197401461103e-07, + "loss": -0.004, + "reward": 1.0558035969734192, + "reward_std": 0.2634247988462448, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.9464286118745804, + "step": 1470 + }, + { + "completion_length": 1009.0201568603516, + "epoch": 0.43939959674408186, + "grad_norm": 1.080284595489502, + "kl": 0.5478515625, + "learning_rate": 7.193849911558913e-07, + "loss": 0.0512, + "reward": 1.1473214626312256, + "reward_std": 0.21039610169827938, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9620536118745804, + "step": 1471 + }, + { + "completion_length": 999.6094207763672, + "epoch": 0.4396983048316033, + "grad_norm": 1.0689444541931152, + "kl": 0.501953125, + "learning_rate": 7.189500578914033e-07, + "loss": 0.0043, + "reward": 1.0647321939468384, + "reward_std": 0.20898154005408287, + "rewards/accuracy_reward": 0.10937500675208867, + "rewards/format_reward": 0.9553571790456772, + "step": 1472 + }, + { + "completion_length": 933.9174652099609, + "epoch": 0.4399970129191248, + "grad_norm": 1.0623749494552612, + "kl": 0.437744140625, + "learning_rate": 7.185149408258112e-07, + "loss": 0.0474, + "reward": 1.084821492433548, + "reward_std": 0.17946989089250565, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.9709821790456772, + "step": 1473 + }, + { + "completion_length": 1012.4710235595703, + "epoch": 0.44029572100664627, + "grad_norm": 1.0746158361434937, + "kl": 0.50927734375, + "learning_rate": 7.180796404324797e-07, + "loss": -0.0105, + "reward": 1.0937500596046448, + "reward_std": 0.2751845568418503, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.9486607611179352, + "step": 1474 + }, + { + "completion_length": 1024.2611846923828, + "epoch": 0.44059442909416774, + "grad_norm": 0.8582243919372559, + "kl": 0.50048828125, + "learning_rate": 7.17644157184973e-07, + "loss": -0.0095, + "reward": 1.1361607909202576, + "reward_std": 0.18552878499031067, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9642857611179352, + "step": 1475 + }, + { + "completion_length": 1081.2210083007812, + "epoch": 0.4408931371816892, + "grad_norm": 1.9931443929672241, + "kl": 0.46240234375, + "learning_rate": 7.172084915570541e-07, + "loss": 0.0651, + "reward": 1.1875000596046448, + "reward_std": 0.34567927569150925, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9575893133878708, + "step": 1476 + }, + { + "completion_length": 1061.7187805175781, + "epoch": 0.4411918452692107, + "grad_norm": 0.7084383368492126, + "kl": 0.46875, + "learning_rate": 7.167726440226846e-07, + "loss": 0.0235, + "reward": 1.066964328289032, + "reward_std": 0.22783702239394188, + "rewards/accuracy_reward": 0.1138392873108387, + "rewards/format_reward": 0.9531250447034836, + "step": 1477 + }, + { + "completion_length": 1044.0312957763672, + "epoch": 0.44149055335673215, + "grad_norm": 0.7327530384063721, + "kl": 0.44091796875, + "learning_rate": 7.16336615056024e-07, + "loss": 0.0196, + "reward": 1.1250000596046448, + "reward_std": 0.20793098397552967, + "rewards/accuracy_reward": 0.15178572479635477, + "rewards/format_reward": 0.9732143431901932, + "step": 1478 + }, + { + "completion_length": 1023.8304138183594, + "epoch": 0.4417892614442536, + "grad_norm": 0.907747745513916, + "kl": 0.43017578125, + "learning_rate": 7.159004051314289e-07, + "loss": -0.0056, + "reward": 1.0758929252624512, + "reward_std": 0.189230315387249, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.9709821939468384, + "step": 1479 + }, + { + "completion_length": 1046.5514068603516, + "epoch": 0.4420879695317751, + "grad_norm": 0.7731805443763733, + "kl": 0.50048828125, + "learning_rate": 7.154640147234529e-07, + "loss": 0.0165, + "reward": 1.1584822237491608, + "reward_std": 0.23001131787896156, + "rewards/accuracy_reward": 0.21428572572767735, + "rewards/format_reward": 0.9441964775323868, + "step": 1480 + }, + { + "completion_length": 1037.5157012939453, + "epoch": 0.44238667761929656, + "grad_norm": 0.8673226237297058, + "kl": 0.40966796875, + "learning_rate": 7.150274443068463e-07, + "loss": 0.0135, + "reward": 1.0870536267757416, + "reward_std": 0.2493886984884739, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.9642857611179352, + "step": 1481 + }, + { + "completion_length": 960.5759429931641, + "epoch": 0.44268538570681804, + "grad_norm": 7.685478210449219, + "kl": 0.55322265625, + "learning_rate": 7.145906943565546e-07, + "loss": -0.0005, + "reward": 1.1138393133878708, + "reward_std": 0.16360868141055107, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9620536118745804, + "step": 1482 + }, + { + "completion_length": 952.7009429931641, + "epoch": 0.4429840937943395, + "grad_norm": 1.0178402662277222, + "kl": 0.55810546875, + "learning_rate": 7.14153765347719e-07, + "loss": 0.0417, + "reward": 1.1383928656578064, + "reward_std": 0.2956918589770794, + "rewards/accuracy_reward": 0.18080357648432255, + "rewards/format_reward": 0.957589328289032, + "step": 1483 + }, + { + "completion_length": 1119.5536193847656, + "epoch": 0.443282801881861, + "grad_norm": 0.8973973393440247, + "kl": 0.36669921875, + "learning_rate": 7.137166577556757e-07, + "loss": 0.0087, + "reward": 1.0982143431901932, + "reward_std": 0.23283173516392708, + "rewards/accuracy_reward": 0.1406250111758709, + "rewards/format_reward": 0.9575893431901932, + "step": 1484 + }, + { + "completion_length": 1032.2545318603516, + "epoch": 0.44358150996938245, + "grad_norm": 1.5550131797790527, + "kl": 0.38916015625, + "learning_rate": 7.132793720559547e-07, + "loss": 0.0503, + "reward": 1.0870536267757416, + "reward_std": 0.24907296523451805, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.957589328289032, + "step": 1485 + }, + { + "completion_length": 1057.9621124267578, + "epoch": 0.44388021805690386, + "grad_norm": 1.3319637775421143, + "kl": 0.34130859375, + "learning_rate": 7.128419087242797e-07, + "loss": 0.0062, + "reward": 1.176339328289032, + "reward_std": 0.29768040776252747, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9553571939468384, + "step": 1486 + }, + { + "completion_length": 1026.7857360839844, + "epoch": 0.44417892614442533, + "grad_norm": 0.6735997796058655, + "kl": 0.46240234375, + "learning_rate": 7.124042682365685e-07, + "loss": 0.0145, + "reward": 1.0959821939468384, + "reward_std": 0.23928401619195938, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9486607611179352, + "step": 1487 + }, + { + "completion_length": 1009.3192291259766, + "epoch": 0.4444776342319468, + "grad_norm": 2.46466064453125, + "kl": 0.51171875, + "learning_rate": 7.119664510689307e-07, + "loss": 0.0075, + "reward": 1.1361607611179352, + "reward_std": 0.20654458925127983, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9776785969734192, + "step": 1488 + }, + { + "completion_length": 1076.513427734375, + "epoch": 0.4447763423194683, + "grad_norm": 0.9126003384590149, + "kl": 0.3154296875, + "learning_rate": 7.115284576976685e-07, + "loss": 0.0362, + "reward": 1.0245536267757416, + "reward_std": 0.20455339178442955, + "rewards/accuracy_reward": 0.06473214388825, + "rewards/format_reward": 0.9598214775323868, + "step": 1489 + }, + { + "completion_length": 995.5580749511719, + "epoch": 0.44507505040698975, + "grad_norm": 1.54283607006073, + "kl": 0.3876953125, + "learning_rate": 7.110902885992759e-07, + "loss": 0.0344, + "reward": 1.066964328289032, + "reward_std": 0.18415249697864056, + "rewards/accuracy_reward": 0.10491071734577417, + "rewards/format_reward": 0.9620536118745804, + "step": 1490 + }, + { + "completion_length": 956.3393249511719, + "epoch": 0.4453737584945112, + "grad_norm": 1.1026743650436401, + "kl": 0.3818359375, + "learning_rate": 7.10651944250438e-07, + "loss": 0.0182, + "reward": 1.1495536267757416, + "reward_std": 0.2557871453464031, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9553571939468384, + "step": 1491 + }, + { + "completion_length": 1048.216567993164, + "epoch": 0.4456724665820327, + "grad_norm": 1.3278940916061401, + "kl": 0.564453125, + "learning_rate": 7.102134251280302e-07, + "loss": 0.0283, + "reward": 1.0535714775323868, + "reward_std": 0.2799305245280266, + "rewards/accuracy_reward": 0.10267857951112092, + "rewards/format_reward": 0.9508928954601288, + "step": 1492 + }, + { + "completion_length": 984.7344055175781, + "epoch": 0.44597117466955416, + "grad_norm": 1.5397720336914062, + "kl": 0.5546875, + "learning_rate": 7.097747317091183e-07, + "loss": 0.0562, + "reward": 1.0781250298023224, + "reward_std": 0.25378914177417755, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9419643133878708, + "step": 1493 + }, + { + "completion_length": 880.4219207763672, + "epoch": 0.44626988275707563, + "grad_norm": 7.9988322257995605, + "kl": 0.779052734375, + "learning_rate": 7.09335864470958e-07, + "loss": 0.0612, + "reward": 1.0379464775323868, + "reward_std": 0.2708500698208809, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.9375000447034836, + "step": 1494 + }, + { + "completion_length": 1003.8504791259766, + "epoch": 0.4465685908445971, + "grad_norm": 5.576866149902344, + "kl": 0.80126953125, + "learning_rate": 7.08896823890994e-07, + "loss": 0.0487, + "reward": 1.07589291036129, + "reward_std": 0.20306203421205282, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.9598214626312256, + "step": 1495 + }, + { + "completion_length": 964.0826416015625, + "epoch": 0.4468672989321186, + "grad_norm": 1.9567077159881592, + "kl": 0.5888671875, + "learning_rate": 7.084576104468588e-07, + "loss": -0.0161, + "reward": 1.1116072237491608, + "reward_std": 0.2054926324635744, + "rewards/accuracy_reward": 0.15625000838190317, + "rewards/format_reward": 0.9553571790456772, + "step": 1496 + }, + { + "completion_length": 937.7299499511719, + "epoch": 0.44716600701964004, + "grad_norm": 1.9706050157546997, + "kl": 0.7001953125, + "learning_rate": 7.080182246163741e-07, + "loss": 0.0164, + "reward": 1.0066964626312256, + "reward_std": 0.2549808546900749, + "rewards/accuracy_reward": 0.08035714575089514, + "rewards/format_reward": 0.9263393431901932, + "step": 1497 + }, + { + "completion_length": 1076.7433471679688, + "epoch": 0.4474647151071615, + "grad_norm": 1.400977373123169, + "kl": 0.6162109375, + "learning_rate": 7.075786668775485e-07, + "loss": 0.0476, + "reward": 1.0781250298023224, + "reward_std": 0.22804898768663406, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9598214477300644, + "step": 1498 + }, + { + "completion_length": 1028.2076263427734, + "epoch": 0.447763423194683, + "grad_norm": 1.0223791599273682, + "kl": 0.64453125, + "learning_rate": 7.071389377085777e-07, + "loss": 0.0536, + "reward": 1.1361607611179352, + "reward_std": 0.2681823195889592, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9598214626312256, + "step": 1499 + }, + { + "completion_length": 938.2656707763672, + "epoch": 0.44806213128220446, + "grad_norm": 2.6675796508789062, + "kl": 0.6416015625, + "learning_rate": 7.066990375878439e-07, + "loss": 0.0234, + "reward": 1.129464328289032, + "reward_std": 0.26070447638630867, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9553571939468384, + "step": 1500 + }, + { + "completion_length": 941.8616485595703, + "epoch": 0.4483608393697259, + "grad_norm": 1.0908539295196533, + "kl": 0.662109375, + "learning_rate": 7.062589669939154e-07, + "loss": 0.0546, + "reward": 1.2321428954601288, + "reward_std": 0.24417810887098312, + "rewards/accuracy_reward": 0.27008930034935474, + "rewards/format_reward": 0.9620536118745804, + "step": 1501 + }, + { + "completion_length": 901.1116485595703, + "epoch": 0.4486595474572474, + "grad_norm": 1.560289978981018, + "kl": 0.6083984375, + "learning_rate": 7.058187264055459e-07, + "loss": 0.0092, + "reward": 1.0424107611179352, + "reward_std": 0.28148265928030014, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.9397321939468384, + "step": 1502 + }, + { + "completion_length": 1031.5022430419922, + "epoch": 0.44895825554476887, + "grad_norm": 1.7197052240371704, + "kl": 0.828125, + "learning_rate": 7.053783163016739e-07, + "loss": 0.0153, + "reward": 1.0000000298023224, + "reward_std": 0.2599923387169838, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.9397321790456772, + "step": 1503 + }, + { + "completion_length": 1025.2835388183594, + "epoch": 0.44925696363229034, + "grad_norm": 1.5774599313735962, + "kl": 0.6298828125, + "learning_rate": 7.049377371614224e-07, + "loss": 0.036, + "reward": 1.0870536267757416, + "reward_std": 0.19469547644257545, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9687500447034836, + "step": 1504 + }, + { + "completion_length": 845.841552734375, + "epoch": 0.4495556717198118, + "grad_norm": 3.9808547496795654, + "kl": 0.7119140625, + "learning_rate": 7.044969894640984e-07, + "loss": 0.0358, + "reward": 1.0535714626312256, + "reward_std": 0.19777265936136246, + "rewards/accuracy_reward": 0.10044643259607255, + "rewards/format_reward": 0.9531250447034836, + "step": 1505 + }, + { + "completion_length": 974.5826416015625, + "epoch": 0.4498543798073333, + "grad_norm": 4.084972381591797, + "kl": 0.8291015625, + "learning_rate": 7.040560736891922e-07, + "loss": 0.0522, + "reward": 1.0870536267757416, + "reward_std": 0.22057446092367172, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9598214626312256, + "step": 1506 + }, + { + "completion_length": 919.1920166015625, + "epoch": 0.45015308789485475, + "grad_norm": 2.3222713470458984, + "kl": 1.0087890625, + "learning_rate": 7.036149903163771e-07, + "loss": 0.028, + "reward": 1.082589328289032, + "reward_std": 0.30595041811466217, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9285714775323868, + "step": 1507 + }, + { + "completion_length": 895.3147430419922, + "epoch": 0.4504517959823762, + "grad_norm": 2.4531049728393555, + "kl": 0.85498046875, + "learning_rate": 7.031737398255083e-07, + "loss": 0.0515, + "reward": 1.1584821939468384, + "reward_std": 0.2843700088560581, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9464286118745804, + "step": 1508 + }, + { + "completion_length": 951.7120971679688, + "epoch": 0.4507505040698977, + "grad_norm": 12.208002090454102, + "kl": 0.79296875, + "learning_rate": 7.027323226966232e-07, + "loss": 0.0146, + "reward": 1.0446428805589676, + "reward_std": 0.3023636192083359, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9375000447034836, + "step": 1509 + }, + { + "completion_length": 891.5937957763672, + "epoch": 0.45104921215741917, + "grad_norm": 5.306052207946777, + "kl": 0.880859375, + "learning_rate": 7.022907394099404e-07, + "loss": 0.042, + "reward": 1.1651785969734192, + "reward_std": 0.30258722975850105, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9486607611179352, + "step": 1510 + }, + { + "completion_length": 929.0223541259766, + "epoch": 0.45134792024494064, + "grad_norm": 6.505325794219971, + "kl": 0.79296875, + "learning_rate": 7.018489904458592e-07, + "loss": 0.0326, + "reward": 1.0602679252624512, + "reward_std": 0.23969696089625359, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.9620536118745804, + "step": 1511 + }, + { + "completion_length": 1024.732177734375, + "epoch": 0.4516466283324621, + "grad_norm": 1.0376752614974976, + "kl": 0.61962890625, + "learning_rate": 7.014070762849593e-07, + "loss": 0.0455, + "reward": 1.0513393431901932, + "reward_std": 0.23751768097281456, + "rewards/accuracy_reward": 0.10937500675208867, + "rewards/format_reward": 0.9419643133878708, + "step": 1512 + }, + { + "completion_length": 879.0201263427734, + "epoch": 0.4519453364199836, + "grad_norm": 1.8254624605178833, + "kl": 0.5615234375, + "learning_rate": 7.009649974079997e-07, + "loss": 0.0527, + "reward": 1.2611607611179352, + "reward_std": 0.280213650316, + "rewards/accuracy_reward": 0.3058035895228386, + "rewards/format_reward": 0.9553571790456772, + "step": 1513 + }, + { + "completion_length": 887.3995819091797, + "epoch": 0.45224404450750505, + "grad_norm": 2.073286533355713, + "kl": 0.70703125, + "learning_rate": 7.005227542959192e-07, + "loss": 0.037, + "reward": 1.1629464775323868, + "reward_std": 0.1926197912544012, + "rewards/accuracy_reward": 0.20312500838190317, + "rewards/format_reward": 0.9598214775323868, + "step": 1514 + }, + { + "completion_length": 946.0670166015625, + "epoch": 0.4525427525950265, + "grad_norm": 1.7135430574417114, + "kl": 0.64453125, + "learning_rate": 7.000803474298349e-07, + "loss": 0.0296, + "reward": 1.0535714775323868, + "reward_std": 0.21109431982040405, + "rewards/accuracy_reward": 0.09821429033763707, + "rewards/format_reward": 0.9553571790456772, + "step": 1515 + }, + { + "completion_length": 882.1339721679688, + "epoch": 0.452841460682548, + "grad_norm": 2.131434679031372, + "kl": 0.81640625, + "learning_rate": 6.99637777291042e-07, + "loss": -0.0006, + "reward": 1.1294643580913544, + "reward_std": 0.25638051703572273, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 1516 + }, + { + "completion_length": 976.7299499511719, + "epoch": 0.45314016877006946, + "grad_norm": 2.3907649517059326, + "kl": 0.8662109375, + "learning_rate": 6.991950443610134e-07, + "loss": 0.0228, + "reward": 1.1004464626312256, + "reward_std": 0.19401276484131813, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.957589328289032, + "step": 1517 + }, + { + "completion_length": 1022.9688110351562, + "epoch": 0.45343887685759093, + "grad_norm": 1.2756787538528442, + "kl": 0.66943359375, + "learning_rate": 6.987521491213992e-07, + "loss": 0.0508, + "reward": 1.1339286267757416, + "reward_std": 0.1989163402467966, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9553571790456772, + "step": 1518 + }, + { + "completion_length": 977.8795166015625, + "epoch": 0.4537375849451124, + "grad_norm": 1.5851162672042847, + "kl": 0.7158203125, + "learning_rate": 6.983090920540261e-07, + "loss": 0.0169, + "reward": 1.0379464775323868, + "reward_std": 0.17056679166853428, + "rewards/accuracy_reward": 0.06919643259607255, + "rewards/format_reward": 0.9687500298023224, + "step": 1519 + }, + { + "completion_length": 944.4910888671875, + "epoch": 0.4540362930326339, + "grad_norm": 5.8745880126953125, + "kl": 0.80859375, + "learning_rate": 6.978658736408969e-07, + "loss": 0.0564, + "reward": 1.0714286267757416, + "reward_std": 0.25156509689986706, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.9553571939468384, + "step": 1520 + }, + { + "completion_length": 956.4308624267578, + "epoch": 0.45433500112015535, + "grad_norm": 1.6047919988632202, + "kl": 0.7392578125, + "learning_rate": 6.974224943641893e-07, + "loss": 0.0313, + "reward": 1.0424107611179352, + "reward_std": 0.19542381912469864, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.9598214775323868, + "step": 1521 + }, + { + "completion_length": 1019.9464721679688, + "epoch": 0.4546337092076768, + "grad_norm": 2.533127546310425, + "kl": 0.8271484375, + "learning_rate": 6.969789547062569e-07, + "loss": 0.0285, + "reward": 1.0357143431901932, + "reward_std": 0.27702003717422485, + "rewards/accuracy_reward": 0.10714286239817739, + "rewards/format_reward": 0.9285714775323868, + "step": 1522 + }, + { + "completion_length": 993.3772735595703, + "epoch": 0.4549324172951983, + "grad_norm": 1.1756510734558105, + "kl": 0.78125, + "learning_rate": 6.965352551496273e-07, + "loss": 0.0375, + "reward": 1.0223214626312256, + "reward_std": 0.21282707899808884, + "rewards/accuracy_reward": 0.07142857694998384, + "rewards/format_reward": 0.9508928954601288, + "step": 1523 + }, + { + "completion_length": 1024.4442596435547, + "epoch": 0.45523112538271976, + "grad_norm": 0.8866927027702332, + "kl": 0.6416015625, + "learning_rate": 6.960913961770021e-07, + "loss": 0.0433, + "reward": 1.0669643580913544, + "reward_std": 0.17072746716439724, + "rewards/accuracy_reward": 0.10267857322469354, + "rewards/format_reward": 0.964285746216774, + "step": 1524 + }, + { + "completion_length": 1001.4464721679688, + "epoch": 0.45552983347024123, + "grad_norm": 2.2108070850372314, + "kl": 0.6748046875, + "learning_rate": 6.956473782712562e-07, + "loss": 0.0493, + "reward": 1.0714285969734192, + "reward_std": 0.2691582925617695, + "rewards/accuracy_reward": 0.12500000931322575, + "rewards/format_reward": 0.9464286118745804, + "step": 1525 + }, + { + "completion_length": 1051.558090209961, + "epoch": 0.4558285415577627, + "grad_norm": 1.7768619060516357, + "kl": 0.65625, + "learning_rate": 6.952032019154378e-07, + "loss": 0.0415, + "reward": 1.1383929252624512, + "reward_std": 0.2614155001938343, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.964285746216774, + "step": 1526 + }, + { + "completion_length": 970.4040679931641, + "epoch": 0.45612724964528417, + "grad_norm": 2.0167360305786133, + "kl": 0.66650390625, + "learning_rate": 6.947588675927673e-07, + "loss": 0.0357, + "reward": 1.1964285969734192, + "reward_std": 0.20835253968834877, + "rewards/accuracy_reward": 0.23660715040750802, + "rewards/format_reward": 0.9598214626312256, + "step": 1527 + }, + { + "completion_length": 890.1406707763672, + "epoch": 0.45642595773280564, + "grad_norm": 3.3217782974243164, + "kl": 0.83203125, + "learning_rate": 6.943143757866365e-07, + "loss": 0.0958, + "reward": 1.1205357909202576, + "reward_std": 0.29758910089731216, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9419643431901932, + "step": 1528 + }, + { + "completion_length": 925.2344207763672, + "epoch": 0.45672466582032706, + "grad_norm": 16.850238800048828, + "kl": 0.9697265625, + "learning_rate": 6.938697269806094e-07, + "loss": 0.0181, + "reward": 1.1049107611179352, + "reward_std": 0.20506335888057947, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.979910746216774, + "step": 1529 + }, + { + "completion_length": 943.7969207763672, + "epoch": 0.45702337390784853, + "grad_norm": 38.428951263427734, + "kl": 1.60546875, + "learning_rate": 6.934249216584202e-07, + "loss": 0.0717, + "reward": 1.1986607313156128, + "reward_std": 0.22692430391907692, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9754464626312256, + "step": 1530 + }, + { + "completion_length": 945.6964569091797, + "epoch": 0.45732208199537, + "grad_norm": 58.607879638671875, + "kl": 1.9951171875, + "learning_rate": 6.929799603039731e-07, + "loss": 0.0956, + "reward": 1.0468750596046448, + "reward_std": 0.19058025628328323, + "rewards/accuracy_reward": 0.08482143026776612, + "rewards/format_reward": 0.9620536267757416, + "step": 1531 + }, + { + "completion_length": 881.7388916015625, + "epoch": 0.45762079008289147, + "grad_norm": 48.048309326171875, + "kl": 2.17578125, + "learning_rate": 6.925348434013428e-07, + "loss": 0.0947, + "reward": 1.1986607611179352, + "reward_std": 0.25058313459157944, + "rewards/accuracy_reward": 0.243303582072258, + "rewards/format_reward": 0.9553571790456772, + "step": 1532 + }, + { + "completion_length": 913.4107666015625, + "epoch": 0.45791949817041294, + "grad_norm": 20.593124389648438, + "kl": 1.3876953125, + "learning_rate": 6.920895714347729e-07, + "loss": 0.0508, + "reward": 1.0424107611179352, + "reward_std": 0.18062568083405495, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.9642857313156128, + "step": 1533 + }, + { + "completion_length": 939.3460235595703, + "epoch": 0.4582182062579344, + "grad_norm": 1.8999813795089722, + "kl": 0.720703125, + "learning_rate": 6.916441448886754e-07, + "loss": 0.0364, + "reward": 1.082589328289032, + "reward_std": 0.27238572388887405, + "rewards/accuracy_reward": 0.12946429196745157, + "rewards/format_reward": 0.9531250447034836, + "step": 1534 + }, + { + "completion_length": 1014.6964721679688, + "epoch": 0.4585169143454559, + "grad_norm": 3.162628412246704, + "kl": 0.6884765625, + "learning_rate": 6.911985642476309e-07, + "loss": 0.0818, + "reward": 1.1294643431901932, + "reward_std": 0.2472698837518692, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9486607611179352, + "step": 1535 + }, + { + "completion_length": 955.9107666015625, + "epoch": 0.45881562243297735, + "grad_norm": 2.1909103393554688, + "kl": 0.609375, + "learning_rate": 6.907528299963875e-07, + "loss": 0.0088, + "reward": 1.113839328289032, + "reward_std": 0.22537768259644508, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9575893133878708, + "step": 1536 + }, + { + "completion_length": 1011.2232513427734, + "epoch": 0.4591143305204988, + "grad_norm": 1.932998776435852, + "kl": 0.609375, + "learning_rate": 6.903069426198605e-07, + "loss": 0.0376, + "reward": 1.1205357611179352, + "reward_std": 0.2206650786101818, + "rewards/accuracy_reward": 0.15848215529695153, + "rewards/format_reward": 0.9620536118745804, + "step": 1537 + }, + { + "completion_length": 979.6629791259766, + "epoch": 0.4594130386080203, + "grad_norm": 1.2582365274429321, + "kl": 0.654296875, + "learning_rate": 6.898609026031312e-07, + "loss": 0.0404, + "reward": 1.1250000447034836, + "reward_std": 0.2327822670340538, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.9375000298023224, + "step": 1538 + }, + { + "completion_length": 984.8013916015625, + "epoch": 0.45971174669554177, + "grad_norm": 1.9450641870498657, + "kl": 0.62060546875, + "learning_rate": 6.894147104314478e-07, + "loss": 0.032, + "reward": 1.1361607909202576, + "reward_std": 0.24324511364102364, + "rewards/accuracy_reward": 0.17857143748551607, + "rewards/format_reward": 0.9575893431901932, + "step": 1539 + }, + { + "completion_length": 908.0245971679688, + "epoch": 0.46001045478306324, + "grad_norm": 1.319219946861267, + "kl": 0.55810546875, + "learning_rate": 6.889683665902237e-07, + "loss": 0.0168, + "reward": 1.0602679252624512, + "reward_std": 0.19480964168906212, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.9642857611179352, + "step": 1540 + }, + { + "completion_length": 949.6629943847656, + "epoch": 0.4603091628705847, + "grad_norm": 5.231536865234375, + "kl": 0.91796875, + "learning_rate": 6.885218715650369e-07, + "loss": 0.1024, + "reward": 1.0491071790456772, + "reward_std": 0.276202991604805, + "rewards/accuracy_reward": 0.1138392873108387, + "rewards/format_reward": 0.9352678954601288, + "step": 1541 + }, + { + "completion_length": 982.9107513427734, + "epoch": 0.4606078709581062, + "grad_norm": 5.515761852264404, + "kl": 0.7802734375, + "learning_rate": 6.880752258416306e-07, + "loss": 0.0542, + "reward": 1.1026785969734192, + "reward_std": 0.30361902713775635, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9330357760190964, + "step": 1542 + }, + { + "completion_length": 1014.9375457763672, + "epoch": 0.46090657904562765, + "grad_norm": 2.5224621295928955, + "kl": 0.724609375, + "learning_rate": 6.876284299059113e-07, + "loss": 0.0187, + "reward": 0.9754464626312256, + "reward_std": 0.19805934093892574, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.9508928954601288, + "step": 1543 + }, + { + "completion_length": 960.3527069091797, + "epoch": 0.4612052871331491, + "grad_norm": 2.0471434593200684, + "kl": 0.5654296875, + "learning_rate": 6.871814842439494e-07, + "loss": 0.0414, + "reward": 1.1205357611179352, + "reward_std": 0.2479836754500866, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.9553571790456772, + "step": 1544 + }, + { + "completion_length": 932.6942596435547, + "epoch": 0.4615039952206706, + "grad_norm": 1.77581787109375, + "kl": 0.4375, + "learning_rate": 6.867343893419778e-07, + "loss": 0.0405, + "reward": 1.165178656578064, + "reward_std": 0.3166917636990547, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.9508928954601288, + "step": 1545 + }, + { + "completion_length": 981.2433471679688, + "epoch": 0.46180270330819206, + "grad_norm": 1.0389384031295776, + "kl": 0.7646484375, + "learning_rate": 6.86287145686392e-07, + "loss": 0.0629, + "reward": 1.0290178954601288, + "reward_std": 0.24499113112688065, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.948660746216774, + "step": 1546 + }, + { + "completion_length": 1030.0245971679688, + "epoch": 0.46210141139571353, + "grad_norm": 2.2684171199798584, + "kl": 0.66748046875, + "learning_rate": 6.858397537637492e-07, + "loss": 0.0454, + "reward": 1.037946492433548, + "reward_std": 0.24279095232486725, + "rewards/accuracy_reward": 0.09375000302679837, + "rewards/format_reward": 0.944196492433548, + "step": 1547 + }, + { + "completion_length": 991.9353179931641, + "epoch": 0.462400119483235, + "grad_norm": 3.1855998039245605, + "kl": 0.6708984375, + "learning_rate": 6.853922140607683e-07, + "loss": 0.057, + "reward": 1.149553582072258, + "reward_std": 0.25682447105646133, + "rewards/accuracy_reward": 0.20312501536682248, + "rewards/format_reward": 0.9464285969734192, + "step": 1548 + }, + { + "completion_length": 892.9263763427734, + "epoch": 0.4626988275707565, + "grad_norm": 2.6658835411071777, + "kl": 0.7646484375, + "learning_rate": 6.849445270643282e-07, + "loss": 0.0736, + "reward": 1.0089286416769028, + "reward_std": 0.19922013953328133, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.9419643431901932, + "step": 1549 + }, + { + "completion_length": 907.3370971679688, + "epoch": 0.46299753565827795, + "grad_norm": 7.209531307220459, + "kl": 0.986328125, + "learning_rate": 6.844966932614686e-07, + "loss": 0.0386, + "reward": 1.0959821939468384, + "reward_std": 0.2972991615533829, + "rewards/accuracy_reward": 0.15848215413279831, + "rewards/format_reward": 0.9375000447034836, + "step": 1550 + }, + { + "completion_length": 942.8861999511719, + "epoch": 0.4632962437457994, + "grad_norm": 12.298089027404785, + "kl": 1.3798828125, + "learning_rate": 6.840487131393888e-07, + "loss": 0.0529, + "reward": 1.1183036267757416, + "reward_std": 0.2871335484087467, + "rewards/accuracy_reward": 0.17633929662406445, + "rewards/format_reward": 0.941964328289032, + "step": 1551 + }, + { + "completion_length": 1025.323715209961, + "epoch": 0.4635949518333209, + "grad_norm": 18.022104263305664, + "kl": 1.578125, + "learning_rate": 6.836005871854474e-07, + "loss": 0.1291, + "reward": 1.1272321790456772, + "reward_std": 0.31083744019269943, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9241071939468384, + "step": 1552 + }, + { + "completion_length": 883.7924652099609, + "epoch": 0.46389365992084236, + "grad_norm": 3.4829959869384766, + "kl": 0.7041015625, + "learning_rate": 6.831523158871612e-07, + "loss": 0.0729, + "reward": 1.0803572237491608, + "reward_std": 0.25555235520005226, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9598214775323868, + "step": 1553 + }, + { + "completion_length": 1005.7857666015625, + "epoch": 0.46419236800836383, + "grad_norm": 0.9204956293106079, + "kl": 0.77392578125, + "learning_rate": 6.82703899732206e-07, + "loss": 0.0458, + "reward": 1.0245536118745804, + "reward_std": 0.19213085249066353, + "rewards/accuracy_reward": 0.07589286286383867, + "rewards/format_reward": 0.9486607611179352, + "step": 1554 + }, + { + "completion_length": 945.1228179931641, + "epoch": 0.4644910760958853, + "grad_norm": 2.0567688941955566, + "kl": 0.736328125, + "learning_rate": 6.82255339208414e-07, + "loss": 0.0375, + "reward": 1.082589328289032, + "reward_std": 0.2167803067713976, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.9598214626312256, + "step": 1555 + }, + { + "completion_length": 962.1116333007812, + "epoch": 0.4647897841834068, + "grad_norm": 2.3318307399749756, + "kl": 0.69921875, + "learning_rate": 6.818066348037755e-07, + "loss": 0.0142, + "reward": 1.100446492433548, + "reward_std": 0.29399412870407104, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.941964328289032, + "step": 1556 + }, + { + "completion_length": 946.1808471679688, + "epoch": 0.46508849227092824, + "grad_norm": 1.7894855737686157, + "kl": 0.7607421875, + "learning_rate": 6.813577870064366e-07, + "loss": 0.0945, + "reward": 1.0379465073347092, + "reward_std": 0.25842443481087685, + "rewards/accuracy_reward": 0.10491071688011289, + "rewards/format_reward": 0.9330357611179352, + "step": 1557 + }, + { + "completion_length": 1007.4241790771484, + "epoch": 0.4653872003584497, + "grad_norm": 2.755030393600464, + "kl": 0.791015625, + "learning_rate": 6.809087963047e-07, + "loss": 0.041, + "reward": 1.1183036267757416, + "reward_std": 0.24860962852835655, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9508928954601288, + "step": 1558 + }, + { + "completion_length": 907.0647888183594, + "epoch": 0.4656859084459712, + "grad_norm": 1.6545350551605225, + "kl": 0.974609375, + "learning_rate": 6.804596631870234e-07, + "loss": 0.0612, + "reward": 1.0691964626312256, + "reward_std": 0.28911879286170006, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.957589328289032, + "step": 1559 + }, + { + "completion_length": 1030.6942443847656, + "epoch": 0.46598461653349266, + "grad_norm": 6.257719993591309, + "kl": 1.2294921875, + "learning_rate": 6.800103881420198e-07, + "loss": 0.0633, + "reward": 1.0290178954601288, + "reward_std": 0.2747988738119602, + "rewards/accuracy_reward": 0.10937500093132257, + "rewards/format_reward": 0.9196428954601288, + "step": 1560 + }, + { + "completion_length": 980.2634582519531, + "epoch": 0.46628332462101413, + "grad_norm": 5.7038397789001465, + "kl": 1.0576171875, + "learning_rate": 6.795609716584562e-07, + "loss": 0.0534, + "reward": 1.073660746216774, + "reward_std": 0.2855752743780613, + "rewards/accuracy_reward": 0.1339285729918629, + "rewards/format_reward": 0.939732164144516, + "step": 1561 + }, + { + "completion_length": 900.6473541259766, + "epoch": 0.4665820327085356, + "grad_norm": 7.572509765625, + "kl": 1.0966796875, + "learning_rate": 6.791114142252538e-07, + "loss": 0.0448, + "reward": 1.0825893580913544, + "reward_std": 0.2443261779844761, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.9687500447034836, + "step": 1562 + }, + { + "completion_length": 918.1629638671875, + "epoch": 0.46688074079605707, + "grad_norm": 1.0329463481903076, + "kl": 0.7646484375, + "learning_rate": 6.78661716331487e-07, + "loss": 0.0146, + "reward": 1.0982143133878708, + "reward_std": 0.2364058718085289, + "rewards/accuracy_reward": 0.15178572107106447, + "rewards/format_reward": 0.9464286118745804, + "step": 1563 + }, + { + "completion_length": 986.3214721679688, + "epoch": 0.46717944888357854, + "grad_norm": 2.209886312484741, + "kl": 0.755859375, + "learning_rate": 6.782118784663829e-07, + "loss": 0.0296, + "reward": 1.0558036267757416, + "reward_std": 0.28941546753048897, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.9441964775323868, + "step": 1564 + }, + { + "completion_length": 921.9241638183594, + "epoch": 0.4674781569711, + "grad_norm": 4.344401836395264, + "kl": 0.57421875, + "learning_rate": 6.777619011193213e-07, + "loss": 0.0669, + "reward": 1.1049107611179352, + "reward_std": 0.22150767967104912, + "rewards/accuracy_reward": 0.15401786379516125, + "rewards/format_reward": 0.9508928805589676, + "step": 1565 + }, + { + "completion_length": 963.2321929931641, + "epoch": 0.4677768650586215, + "grad_norm": 2.6981422901153564, + "kl": 0.6611328125, + "learning_rate": 6.773117847798333e-07, + "loss": 0.0262, + "reward": 1.0513393431901932, + "reward_std": 0.24979346618056297, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.9508928954601288, + "step": 1566 + }, + { + "completion_length": 796.3013610839844, + "epoch": 0.46807557314614295, + "grad_norm": 1.3051201105117798, + "kl": 0.7353515625, + "learning_rate": 6.768615299376013e-07, + "loss": 0.0074, + "reward": 1.0736607611179352, + "reward_std": 0.26583848893642426, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9464285969734192, + "step": 1567 + }, + { + "completion_length": 886.6473541259766, + "epoch": 0.4683742812336644, + "grad_norm": 1.6781140565872192, + "kl": 0.609375, + "learning_rate": 6.764111370824585e-07, + "loss": 0.1099, + "reward": 1.1986607611179352, + "reward_std": 0.2108034621924162, + "rewards/accuracy_reward": 0.24553571874275804, + "rewards/format_reward": 0.9531250447034836, + "step": 1568 + }, + { + "completion_length": 993.5022888183594, + "epoch": 0.4686729893211859, + "grad_norm": 3.50368332862854, + "kl": 0.6357421875, + "learning_rate": 6.759606067043882e-07, + "loss": 0.0491, + "reward": 1.0647321790456772, + "reward_std": 0.26585014536976814, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.948660746216774, + "step": 1569 + }, + { + "completion_length": 1070.3661193847656, + "epoch": 0.46897169740870737, + "grad_norm": 18.175630569458008, + "kl": 0.99462890625, + "learning_rate": 6.755099392935232e-07, + "loss": 0.0501, + "reward": 1.064732164144516, + "reward_std": 0.25419359654188156, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9397321790456772, + "step": 1570 + }, + { + "completion_length": 969.0625457763672, + "epoch": 0.46927040549622884, + "grad_norm": 1.8306149244308472, + "kl": 0.7158203125, + "learning_rate": 6.750591353401456e-07, + "loss": 0.0869, + "reward": 1.1383928954601288, + "reward_std": 0.26804765686392784, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.9486607611179352, + "step": 1571 + }, + { + "completion_length": 1024.935317993164, + "epoch": 0.46956911358375025, + "grad_norm": 1.107470154762268, + "kl": 0.6279296875, + "learning_rate": 6.746081953346858e-07, + "loss": 0.0454, + "reward": 1.037946492433548, + "reward_std": 0.239873468875885, + "rewards/accuracy_reward": 0.09821429196745157, + "rewards/format_reward": 0.9397321790456772, + "step": 1572 + }, + { + "completion_length": 958.1786193847656, + "epoch": 0.4698678216712717, + "grad_norm": 1.0627208948135376, + "kl": 0.671875, + "learning_rate": 6.741571197677225e-07, + "loss": 0.028, + "reward": 1.100446492433548, + "reward_std": 0.21584536880254745, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.957589328289032, + "step": 1573 + }, + { + "completion_length": 929.8817443847656, + "epoch": 0.4701665297587932, + "grad_norm": 1.0964986085891724, + "kl": 0.43798828125, + "learning_rate": 6.737059091299817e-07, + "loss": 0.0456, + "reward": 1.142857164144516, + "reward_std": 0.19919848442077637, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.964285746216774, + "step": 1574 + }, + { + "completion_length": 989.7210388183594, + "epoch": 0.47046523784631467, + "grad_norm": 0.8266751170158386, + "kl": 0.49609375, + "learning_rate": 6.73254563912336e-07, + "loss": 0.0152, + "reward": 1.035714328289032, + "reward_std": 0.21935118362307549, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.9508928805589676, + "step": 1575 + }, + { + "completion_length": 896.2455749511719, + "epoch": 0.47076394593383614, + "grad_norm": 1.1105914115905762, + "kl": 0.4833984375, + "learning_rate": 6.728030846058052e-07, + "loss": 0.0346, + "reward": 1.100446492433548, + "reward_std": 0.17689124308526516, + "rewards/accuracy_reward": 0.12276786239817739, + "rewards/format_reward": 0.9776786118745804, + "step": 1576 + }, + { + "completion_length": 927.794677734375, + "epoch": 0.4710626540213576, + "grad_norm": 1.62958562374115, + "kl": 0.43994140625, + "learning_rate": 6.723514717015542e-07, + "loss": 0.0328, + "reward": 1.0602678805589676, + "reward_std": 0.17725604958832264, + "rewards/accuracy_reward": 0.10491072130389512, + "rewards/format_reward": 0.9553571939468384, + "step": 1577 + }, + { + "completion_length": 883.0625305175781, + "epoch": 0.4713613621088791, + "grad_norm": 0.8897303938865662, + "kl": 0.46728515625, + "learning_rate": 6.718997256908938e-07, + "loss": 0.0499, + "reward": 1.1026785969734192, + "reward_std": 0.17725633643567562, + "rewards/accuracy_reward": 0.13169643213041127, + "rewards/format_reward": 0.9709821790456772, + "step": 1578 + }, + { + "completion_length": 952.3549346923828, + "epoch": 0.47166007019640055, + "grad_norm": 1.3756965398788452, + "kl": 0.474609375, + "learning_rate": 6.714478470652792e-07, + "loss": 0.0581, + "reward": 1.0111607611179352, + "reward_std": 0.11461924109607935, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.9687500596046448, + "step": 1579 + }, + { + "completion_length": 877.7344055175781, + "epoch": 0.471958778283922, + "grad_norm": 0.9761456251144409, + "kl": 0.6015625, + "learning_rate": 6.709958363163104e-07, + "loss": 0.0476, + "reward": 1.1160714626312256, + "reward_std": 0.2204692494124174, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.957589328289032, + "step": 1580 + }, + { + "completion_length": 925.0781707763672, + "epoch": 0.4722574863714435, + "grad_norm": 1.2412935495376587, + "kl": 0.54052734375, + "learning_rate": 6.705436939357304e-07, + "loss": 0.0431, + "reward": 1.0647321939468384, + "reward_std": 0.22481311205774546, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.9598214775323868, + "step": 1581 + }, + { + "completion_length": 898.3460235595703, + "epoch": 0.47255619445896496, + "grad_norm": 1.5955567359924316, + "kl": 0.49365234375, + "learning_rate": 6.700914204154258e-07, + "loss": 0.0183, + "reward": 1.1473214626312256, + "reward_std": 0.1890416517853737, + "rewards/accuracy_reward": 0.18080357927829027, + "rewards/format_reward": 0.96651791036129, + "step": 1582 + }, + { + "completion_length": 917.904052734375, + "epoch": 0.47285490254648643, + "grad_norm": 0.7819201350212097, + "kl": 0.52001953125, + "learning_rate": 6.696390162474261e-07, + "loss": 0.0147, + "reward": 1.1428571939468384, + "reward_std": 0.20923107862472534, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9620536118745804, + "step": 1583 + }, + { + "completion_length": 904.6317596435547, + "epoch": 0.4731536106340079, + "grad_norm": 1.0110059976577759, + "kl": 0.56201171875, + "learning_rate": 6.691864819239028e-07, + "loss": 0.0766, + "reward": 1.1227678954601288, + "reward_std": 0.30070775002241135, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9553571790456772, + "step": 1584 + }, + { + "completion_length": 922.9196929931641, + "epoch": 0.4734523187215294, + "grad_norm": 1.2557880878448486, + "kl": 0.759765625, + "learning_rate": 6.687338179371686e-07, + "loss": 0.045, + "reward": 1.1696429252624512, + "reward_std": 0.24111056327819824, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9575893133878708, + "step": 1585 + }, + { + "completion_length": 871.919677734375, + "epoch": 0.47375102680905085, + "grad_norm": 2.9227306842803955, + "kl": 0.7333984375, + "learning_rate": 6.682810247796776e-07, + "loss": 0.0676, + "reward": 1.1607143580913544, + "reward_std": 0.27074624598026276, + "rewards/accuracy_reward": 0.21205358020961285, + "rewards/format_reward": 0.9486607611179352, + "step": 1586 + }, + { + "completion_length": 828.5178833007812, + "epoch": 0.4740497348965723, + "grad_norm": 0.8993777632713318, + "kl": 0.65087890625, + "learning_rate": 6.678281029440243e-07, + "loss": 0.0379, + "reward": 1.0691964626312256, + "reward_std": 0.23400690406560898, + "rewards/accuracy_reward": 0.11830357369035482, + "rewards/format_reward": 0.95089291036129, + "step": 1587 + }, + { + "completion_length": 853.0111999511719, + "epoch": 0.4743484429840938, + "grad_norm": 1.6443688869476318, + "kl": 0.57470703125, + "learning_rate": 6.673750529229437e-07, + "loss": 0.0539, + "reward": 1.1651785969734192, + "reward_std": 0.22156722098588943, + "rewards/accuracy_reward": 0.20089286379516125, + "rewards/format_reward": 0.964285746216774, + "step": 1588 + }, + { + "completion_length": 873.8839874267578, + "epoch": 0.47464715107161526, + "grad_norm": 6.493762493133545, + "kl": 0.73828125, + "learning_rate": 6.669218752093093e-07, + "loss": 0.0408, + "reward": 1.1138393133878708, + "reward_std": 0.18617798388004303, + "rewards/accuracy_reward": 0.15625000675208867, + "rewards/format_reward": 0.9575893431901932, + "step": 1589 + }, + { + "completion_length": 895.9777221679688, + "epoch": 0.47494585915913673, + "grad_norm": 10.131957054138184, + "kl": 0.8330078125, + "learning_rate": 6.664685702961344e-07, + "loss": 0.11, + "reward": 1.0267857611179352, + "reward_std": 0.2643009275197983, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.9330357611179352, + "step": 1590 + }, + { + "completion_length": 936.3549499511719, + "epoch": 0.4752445672466582, + "grad_norm": 4.691150665283203, + "kl": 1.0517578125, + "learning_rate": 6.6601513867657e-07, + "loss": 0.0372, + "reward": 1.066964328289032, + "reward_std": 0.23851927742362022, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9352678805589676, + "step": 1591 + }, + { + "completion_length": 911.2656707763672, + "epoch": 0.47554327533417967, + "grad_norm": 2.6937859058380127, + "kl": 1.14453125, + "learning_rate": 6.655615808439055e-07, + "loss": 0.0508, + "reward": 1.1361607313156128, + "reward_std": 0.2664684168994427, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9330357611179352, + "step": 1592 + }, + { + "completion_length": 849.279052734375, + "epoch": 0.47584198342170114, + "grad_norm": 4.739770412445068, + "kl": 1.12109375, + "learning_rate": 6.651078972915672e-07, + "loss": 0.1002, + "reward": 1.1607143580913544, + "reward_std": 0.3003136031329632, + "rewards/accuracy_reward": 0.2120535857975483, + "rewards/format_reward": 0.948660746216774, + "step": 1593 + }, + { + "completion_length": 1026.3772583007812, + "epoch": 0.4761406915092226, + "grad_norm": 5.5117597579956055, + "kl": 1.3037109375, + "learning_rate": 6.646540885131185e-07, + "loss": 0.0619, + "reward": 1.0535714626312256, + "reward_std": 0.25857116654515266, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9352678954601288, + "step": 1594 + }, + { + "completion_length": 898.1540679931641, + "epoch": 0.4764393995967441, + "grad_norm": 1.5383650064468384, + "kl": 0.9599609375, + "learning_rate": 6.642001550022589e-07, + "loss": 0.0883, + "reward": 1.0401786267757416, + "reward_std": 0.33898860961198807, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.9263393431901932, + "step": 1595 + }, + { + "completion_length": 947.8772888183594, + "epoch": 0.47673810768426556, + "grad_norm": 2.2801995277404785, + "kl": 0.9189453125, + "learning_rate": 6.637460972528234e-07, + "loss": 0.0564, + "reward": 1.0714286267757416, + "reward_std": 0.26872605457901955, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9330357611179352, + "step": 1596 + }, + { + "completion_length": 956.825927734375, + "epoch": 0.477036815771787, + "grad_norm": 4.345261573791504, + "kl": 0.9501953125, + "learning_rate": 6.632919157587825e-07, + "loss": 0.1131, + "reward": 1.0535714775323868, + "reward_std": 0.20694759115576744, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.9508928954601288, + "step": 1597 + }, + { + "completion_length": 899.2790679931641, + "epoch": 0.4773355238593085, + "grad_norm": 1.1641985177993774, + "kl": 1.01953125, + "learning_rate": 6.628376110142407e-07, + "loss": 0.0689, + "reward": 1.1138392984867096, + "reward_std": 0.3048260137438774, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9464286267757416, + "step": 1598 + }, + { + "completion_length": 998.5871124267578, + "epoch": 0.47763423194682997, + "grad_norm": 2.990542411804199, + "kl": 0.923828125, + "learning_rate": 6.623831835134377e-07, + "loss": 0.0258, + "reward": 1.0089286118745804, + "reward_std": 0.2633528485894203, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.9330357611179352, + "step": 1599 + }, + { + "completion_length": 999.1317291259766, + "epoch": 0.47793294003435144, + "grad_norm": 4.938449382781982, + "kl": 1.017578125, + "learning_rate": 6.619286337507457e-07, + "loss": 0.0698, + "reward": 1.084821492433548, + "reward_std": 0.28650131449103355, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9441964775323868, + "step": 1600 + }, + { + "completion_length": 923.8995819091797, + "epoch": 0.4782316481218729, + "grad_norm": 0.8545302748680115, + "kl": 0.5654296875, + "learning_rate": 6.614739622206704e-07, + "loss": 0.0455, + "reward": 1.1227678656578064, + "reward_std": 0.18927692249417305, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.9732143133878708, + "step": 1601 + }, + { + "completion_length": 899.1205749511719, + "epoch": 0.4785303562093944, + "grad_norm": 0.8986114859580994, + "kl": 0.61181640625, + "learning_rate": 6.610191694178499e-07, + "loss": 0.0376, + "reward": 1.0468750298023224, + "reward_std": 0.1858142875134945, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.9531250298023224, + "step": 1602 + }, + { + "completion_length": 973.9397735595703, + "epoch": 0.47882906429691585, + "grad_norm": 3.282870054244995, + "kl": 0.603515625, + "learning_rate": 6.605642558370539e-07, + "loss": 0.0651, + "reward": 1.0558036416769028, + "reward_std": 0.2571561373770237, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9375000447034836, + "step": 1603 + }, + { + "completion_length": 947.0379791259766, + "epoch": 0.4791277723844373, + "grad_norm": 2.5108416080474854, + "kl": 0.5302734375, + "learning_rate": 6.601092219731842e-07, + "loss": 0.0412, + "reward": 1.020089328289032, + "reward_std": 0.199864337220788, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.957589328289032, + "step": 1604 + }, + { + "completion_length": 977.5848693847656, + "epoch": 0.4794264804719588, + "grad_norm": 0.993401050567627, + "kl": 0.5107421875, + "learning_rate": 6.596540683212728e-07, + "loss": 0.0152, + "reward": 1.0803571939468384, + "reward_std": 0.2022182233631611, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.9553571790456772, + "step": 1605 + }, + { + "completion_length": 954.7634582519531, + "epoch": 0.47972518855948026, + "grad_norm": 3.224156141281128, + "kl": 0.697265625, + "learning_rate": 6.591987953764824e-07, + "loss": 0.0206, + "reward": 1.1294643431901932, + "reward_std": 0.21599554643034935, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9397321939468384, + "step": 1606 + }, + { + "completion_length": 919.1362152099609, + "epoch": 0.48002389664700174, + "grad_norm": 10.277273178100586, + "kl": 0.80078125, + "learning_rate": 6.587434036341051e-07, + "loss": 0.0649, + "reward": 1.1808036267757416, + "reward_std": 0.2913231775164604, + "rewards/accuracy_reward": 0.2343750149011612, + "rewards/format_reward": 0.9464285969734192, + "step": 1607 + }, + { + "completion_length": 1022.3058624267578, + "epoch": 0.4803226047345232, + "grad_norm": 7.511974334716797, + "kl": 0.85546875, + "learning_rate": 6.582878935895627e-07, + "loss": 0.0528, + "reward": 1.0892857611179352, + "reward_std": 0.2617523558437824, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.9531250447034836, + "step": 1608 + }, + { + "completion_length": 851.7411041259766, + "epoch": 0.4806213128220447, + "grad_norm": 23.2658748626709, + "kl": 1.3896484375, + "learning_rate": 6.578322657384055e-07, + "loss": 0.0687, + "reward": 1.1071428954601288, + "reward_std": 0.2490231841802597, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9531250447034836, + "step": 1609 + }, + { + "completion_length": 916.9553985595703, + "epoch": 0.48092002090956615, + "grad_norm": 1.5326306819915771, + "kl": 0.66552734375, + "learning_rate": 6.573765205763118e-07, + "loss": 0.0513, + "reward": 1.0290178954601288, + "reward_std": 0.20221833512187004, + "rewards/accuracy_reward": 0.08258928754366934, + "rewards/format_reward": 0.9464286118745804, + "step": 1610 + }, + { + "completion_length": 898.3906555175781, + "epoch": 0.4812187289970876, + "grad_norm": 0.7604172229766846, + "kl": 0.4638671875, + "learning_rate": 6.569206585990878e-07, + "loss": 0.0419, + "reward": 1.2053571939468384, + "reward_std": 0.3051038309931755, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9553571790456772, + "step": 1611 + }, + { + "completion_length": 962.6562957763672, + "epoch": 0.4815174370846091, + "grad_norm": 2.079852342605591, + "kl": 0.6708984375, + "learning_rate": 6.564646803026666e-07, + "loss": 0.0299, + "reward": 1.1026785969734192, + "reward_std": 0.25801847875118256, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9441964626312256, + "step": 1612 + }, + { + "completion_length": 952.9330902099609, + "epoch": 0.48181614517213056, + "grad_norm": 1.9244920015335083, + "kl": 0.53515625, + "learning_rate": 6.560085861831078e-07, + "loss": 0.027, + "reward": 1.0758928954601288, + "reward_std": 0.26672101952135563, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9486607611179352, + "step": 1613 + }, + { + "completion_length": 1049.5692596435547, + "epoch": 0.48211485325965203, + "grad_norm": 2.4598705768585205, + "kl": 0.4814453125, + "learning_rate": 6.555523767365973e-07, + "loss": 0.0462, + "reward": 1.1495535969734192, + "reward_std": 0.23234929144382477, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.9620536118745804, + "step": 1614 + }, + { + "completion_length": 875.9888916015625, + "epoch": 0.48241356134717345, + "grad_norm": 1.3709720373153687, + "kl": 0.47900390625, + "learning_rate": 6.55096052459446e-07, + "loss": 0.037, + "reward": 1.1227679252624512, + "reward_std": 0.21447212621569633, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.9687500298023224, + "step": 1615 + }, + { + "completion_length": 963.8281555175781, + "epoch": 0.4827122694346949, + "grad_norm": 2.7811124324798584, + "kl": 0.59619140625, + "learning_rate": 6.546396138480904e-07, + "loss": 0.0269, + "reward": 1.069196492433548, + "reward_std": 0.25732487440109253, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.9620536267757416, + "step": 1616 + }, + { + "completion_length": 946.1272735595703, + "epoch": 0.4830109775222164, + "grad_norm": 5.7406158447265625, + "kl": 0.50537109375, + "learning_rate": 6.541830613990904e-07, + "loss": 0.0378, + "reward": 1.1674107611179352, + "reward_std": 0.17855428531765938, + "rewards/accuracy_reward": 0.20312500977888703, + "rewards/format_reward": 0.9642857611179352, + "step": 1617 + }, + { + "completion_length": 970.1897888183594, + "epoch": 0.48330968560973786, + "grad_norm": 1.0500949621200562, + "kl": 0.4228515625, + "learning_rate": 6.53726395609131e-07, + "loss": 0.0393, + "reward": 1.1361607611179352, + "reward_std": 0.2376963272690773, + "rewards/accuracy_reward": 0.1808035746216774, + "rewards/format_reward": 0.9553571790456772, + "step": 1618 + }, + { + "completion_length": 983.1763916015625, + "epoch": 0.48360839369725933, + "grad_norm": 1.5010682344436646, + "kl": 0.4560546875, + "learning_rate": 6.532696169750192e-07, + "loss": 0.0534, + "reward": 1.1852679252624512, + "reward_std": 0.2154758721590042, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.9687500447034836, + "step": 1619 + }, + { + "completion_length": 899.075927734375, + "epoch": 0.4839071017847808, + "grad_norm": 4.896778106689453, + "kl": 0.4755859375, + "learning_rate": 6.528127259936856e-07, + "loss": 0.073, + "reward": 1.1339286267757416, + "reward_std": 0.2756292298436165, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.9598214626312256, + "step": 1620 + }, + { + "completion_length": 906.6674499511719, + "epoch": 0.4842058098723023, + "grad_norm": 1.5834907293319702, + "kl": 0.47705078125, + "learning_rate": 6.52355723162183e-07, + "loss": 0.0414, + "reward": 1.0937500596046448, + "reward_std": 0.2691485248506069, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.957589328289032, + "step": 1621 + }, + { + "completion_length": 946.6161346435547, + "epoch": 0.48450451795982374, + "grad_norm": 1.3411986827850342, + "kl": 0.5517578125, + "learning_rate": 6.518986089776854e-07, + "loss": 0.0634, + "reward": 1.1026786267757416, + "reward_std": 0.26127446070313454, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9464286267757416, + "step": 1622 + }, + { + "completion_length": 843.2902069091797, + "epoch": 0.4848032260473452, + "grad_norm": 1.688444972038269, + "kl": 0.4482421875, + "learning_rate": 6.514413839374886e-07, + "loss": 0.0461, + "reward": 1.238839328289032, + "reward_std": 0.22970304265618324, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.9665178954601288, + "step": 1623 + }, + { + "completion_length": 876.7567443847656, + "epoch": 0.4851019341348667, + "grad_norm": 2.338172435760498, + "kl": 0.4619140625, + "learning_rate": 6.509840485390081e-07, + "loss": 0.0829, + "reward": 1.1339285969734192, + "reward_std": 0.2534158043563366, + "rewards/accuracy_reward": 0.1741071566939354, + "rewards/format_reward": 0.9598214626312256, + "step": 1624 + }, + { + "completion_length": 964.9107513427734, + "epoch": 0.48540064222238816, + "grad_norm": 4.484687328338623, + "kl": 0.9345703125, + "learning_rate": 6.505266032797805e-07, + "loss": 0.058, + "reward": 1.006696492433548, + "reward_std": 0.20949107967317104, + "rewards/accuracy_reward": 0.04910714481957257, + "rewards/format_reward": 0.957589328289032, + "step": 1625 + }, + { + "completion_length": 840.2768249511719, + "epoch": 0.4856993503099096, + "grad_norm": 3.2809481620788574, + "kl": 0.6748046875, + "learning_rate": 6.500690486574611e-07, + "loss": 0.0677, + "reward": 1.1696429252624512, + "reward_std": 0.23573090508580208, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9620535969734192, + "step": 1626 + }, + { + "completion_length": 948.7009429931641, + "epoch": 0.4859980583974311, + "grad_norm": 8.670858383178711, + "kl": 0.9306640625, + "learning_rate": 6.496113851698247e-07, + "loss": 0.0929, + "reward": 1.087053656578064, + "reward_std": 0.24951809272170067, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.964285746216774, + "step": 1627 + }, + { + "completion_length": 941.1027069091797, + "epoch": 0.48629676648495257, + "grad_norm": 2.5293092727661133, + "kl": 0.705078125, + "learning_rate": 6.49153613314764e-07, + "loss": 0.061, + "reward": 1.0982143580913544, + "reward_std": 0.2286439687013626, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9665178954601288, + "step": 1628 + }, + { + "completion_length": 873.2969055175781, + "epoch": 0.48659547457247404, + "grad_norm": 1.5617649555206299, + "kl": 0.68798828125, + "learning_rate": 6.486957335902904e-07, + "loss": 0.0607, + "reward": 1.20089291036129, + "reward_std": 0.3508704826235771, + "rewards/accuracy_reward": 0.25223216135054827, + "rewards/format_reward": 0.9486607611179352, + "step": 1629 + }, + { + "completion_length": 905.8393402099609, + "epoch": 0.4868941826599955, + "grad_norm": 5.062297344207764, + "kl": 0.80615234375, + "learning_rate": 6.482377464945316e-07, + "loss": 0.0486, + "reward": 1.0714286267757416, + "reward_std": 0.2017452623695135, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9665178954601288, + "step": 1630 + }, + { + "completion_length": 833.3951263427734, + "epoch": 0.487192890747517, + "grad_norm": 2.824976682662964, + "kl": 0.57958984375, + "learning_rate": 6.477796525257331e-07, + "loss": 0.0755, + "reward": 1.0602679252624512, + "reward_std": 0.2505458891391754, + "rewards/accuracy_reward": 0.11160715110599995, + "rewards/format_reward": 0.9486607611179352, + "step": 1631 + }, + { + "completion_length": 918.4888763427734, + "epoch": 0.48749159883503845, + "grad_norm": 3.312974452972412, + "kl": 0.6650390625, + "learning_rate": 6.473214521822561e-07, + "loss": 0.0519, + "reward": 1.1004464626312256, + "reward_std": 0.21534020081162453, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.948660746216774, + "step": 1632 + }, + { + "completion_length": 835.0111999511719, + "epoch": 0.4877903069225599, + "grad_norm": 2.3640711307525635, + "kl": 0.5947265625, + "learning_rate": 6.468631459625775e-07, + "loss": 0.0282, + "reward": 1.0267857313156128, + "reward_std": 0.23553257435560226, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.95089291036129, + "step": 1633 + }, + { + "completion_length": 835.5781555175781, + "epoch": 0.4880890150100814, + "grad_norm": 3.911334276199341, + "kl": 0.7705078125, + "learning_rate": 6.464047343652898e-07, + "loss": 0.0842, + "reward": 1.1316964626312256, + "reward_std": 0.31037959456443787, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.933035746216774, + "step": 1634 + }, + { + "completion_length": 807.091552734375, + "epoch": 0.48838772309760287, + "grad_norm": 6.338911533355713, + "kl": 0.60107421875, + "learning_rate": 6.459462178890998e-07, + "loss": 0.0744, + "reward": 1.1049107611179352, + "reward_std": 0.23132213950157166, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9575893133878708, + "step": 1635 + }, + { + "completion_length": 863.591552734375, + "epoch": 0.48868643118512434, + "grad_norm": 1.2957234382629395, + "kl": 0.720703125, + "learning_rate": 6.454875970328285e-07, + "loss": 0.0247, + "reward": 1.1852678954601288, + "reward_std": 0.23709243535995483, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9665178954601288, + "step": 1636 + }, + { + "completion_length": 850.794677734375, + "epoch": 0.4889851392726458, + "grad_norm": 1.3113187551498413, + "kl": 0.876953125, + "learning_rate": 6.450288722954103e-07, + "loss": 0.0522, + "reward": 1.0468750447034836, + "reward_std": 0.25344227999448776, + "rewards/accuracy_reward": 0.10491071920841932, + "rewards/format_reward": 0.941964328289032, + "step": 1637 + }, + { + "completion_length": 849.2857513427734, + "epoch": 0.4892838473601673, + "grad_norm": 1.4692739248275757, + "kl": 0.7509765625, + "learning_rate": 6.44570044175893e-07, + "loss": 0.0777, + "reward": 1.0669643431901932, + "reward_std": 0.21887775138020515, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.9620536118745804, + "step": 1638 + }, + { + "completion_length": 917.5156707763672, + "epoch": 0.48958255544768875, + "grad_norm": 2.714418888092041, + "kl": 0.9384765625, + "learning_rate": 6.441111131734364e-07, + "loss": 0.1034, + "reward": 1.0892857313156128, + "reward_std": 0.25401975214481354, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.941964328289032, + "step": 1639 + }, + { + "completion_length": 850.4977874755859, + "epoch": 0.4898812635352102, + "grad_norm": 4.516895294189453, + "kl": 1.08447265625, + "learning_rate": 6.436520797873128e-07, + "loss": 0.0892, + "reward": 1.0714286267757416, + "reward_std": 0.24326537176966667, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9642857611179352, + "step": 1640 + }, + { + "completion_length": 858.1428833007812, + "epoch": 0.4901799716227317, + "grad_norm": 2.14388108253479, + "kl": 0.841796875, + "learning_rate": 6.431929445169051e-07, + "loss": 0.068, + "reward": 1.176339328289032, + "reward_std": 0.18862559087574482, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.973214328289032, + "step": 1641 + }, + { + "completion_length": 876.7053985595703, + "epoch": 0.49047867971025316, + "grad_norm": 1.7068257331848145, + "kl": 0.8662109375, + "learning_rate": 6.427337078617076e-07, + "loss": 0.1264, + "reward": 1.162946492433548, + "reward_std": 0.2492646798491478, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9665178954601288, + "step": 1642 + }, + { + "completion_length": 905.9754943847656, + "epoch": 0.49077738779777463, + "grad_norm": 2.110865831375122, + "kl": 0.958984375, + "learning_rate": 6.422743703213248e-07, + "loss": 0.0604, + "reward": 1.1272321790456772, + "reward_std": 0.32294458895921707, + "rewards/accuracy_reward": 0.209821441443637, + "rewards/format_reward": 0.9174107611179352, + "step": 1643 + }, + { + "completion_length": 821.8080749511719, + "epoch": 0.4910760958852961, + "grad_norm": 3.064762592315674, + "kl": 0.9052734375, + "learning_rate": 6.41814932395471e-07, + "loss": 0.1164, + "reward": 1.1183036416769028, + "reward_std": 0.2811683900654316, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9375000298023224, + "step": 1644 + }, + { + "completion_length": 976.4464569091797, + "epoch": 0.4913748039728176, + "grad_norm": 2.099128007888794, + "kl": 0.66552734375, + "learning_rate": 6.413553945839696e-07, + "loss": 0.0296, + "reward": 0.9888393133878708, + "reward_std": 0.2269076406955719, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.9441964775323868, + "step": 1645 + }, + { + "completion_length": 844.2522735595703, + "epoch": 0.49167351206033905, + "grad_norm": 2.7802391052246094, + "kl": 0.669921875, + "learning_rate": 6.408957573867527e-07, + "loss": 0.0735, + "reward": 1.09151791036129, + "reward_std": 0.204272098839283, + "rewards/accuracy_reward": 0.13392858183942735, + "rewards/format_reward": 0.9575893133878708, + "step": 1646 + }, + { + "completion_length": 784.0982360839844, + "epoch": 0.4919722201478605, + "grad_norm": 2.0621795654296875, + "kl": 0.8232421875, + "learning_rate": 6.404360213038605e-07, + "loss": 0.0578, + "reward": 1.1517857611179352, + "reward_std": 0.24128295108675957, + "rewards/accuracy_reward": 0.2031250123400241, + "rewards/format_reward": 0.948660746216774, + "step": 1647 + }, + { + "completion_length": 770.7076263427734, + "epoch": 0.492270928235382, + "grad_norm": 2.99609375, + "kl": 0.77392578125, + "learning_rate": 6.399761868354409e-07, + "loss": 0.097, + "reward": 1.1718750596046448, + "reward_std": 0.2781372219324112, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.9553571939468384, + "step": 1648 + }, + { + "completion_length": 847.7031555175781, + "epoch": 0.49256963632290346, + "grad_norm": 0.9845255613327026, + "kl": 0.697265625, + "learning_rate": 6.395162544817484e-07, + "loss": 0.0547, + "reward": 1.1696429252624512, + "reward_std": 0.2532978244125843, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.955357164144516, + "step": 1649 + }, + { + "completion_length": 875.4263763427734, + "epoch": 0.49286834441042493, + "grad_norm": 2.293989896774292, + "kl": 1.0849609375, + "learning_rate": 6.390562247431449e-07, + "loss": 0.0129, + "reward": 1.0513393580913544, + "reward_std": 0.2564495764672756, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.95089291036129, + "step": 1650 + }, + { + "completion_length": 805.7812805175781, + "epoch": 0.4931670524979464, + "grad_norm": 10.150471687316895, + "kl": 1.392578125, + "learning_rate": 6.385960981200969e-07, + "loss": 0.1149, + "reward": 1.0647321939468384, + "reward_std": 0.31284137815237045, + "rewards/accuracy_reward": 0.14285715040750802, + "rewards/format_reward": 0.9218750447034836, + "step": 1651 + }, + { + "completion_length": 833.669677734375, + "epoch": 0.49346576058546787, + "grad_norm": 8.640576362609863, + "kl": 1.412109375, + "learning_rate": 6.381358751131778e-07, + "loss": 0.1155, + "reward": 1.1227678954601288, + "reward_std": 0.28690775111317635, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9218750447034836, + "step": 1652 + }, + { + "completion_length": 850.5491485595703, + "epoch": 0.49376446867298934, + "grad_norm": 33.44962692260742, + "kl": 1.7412109375, + "learning_rate": 6.376755562230646e-07, + "loss": 0.1291, + "reward": 1.058035746216774, + "reward_std": 0.3243578001856804, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9285714775323868, + "step": 1653 + }, + { + "completion_length": 917.9866638183594, + "epoch": 0.4940631767605108, + "grad_norm": 1.2219955921173096, + "kl": 1.1484375, + "learning_rate": 6.372151419505397e-07, + "loss": 0.0388, + "reward": 1.0290178805589676, + "reward_std": 0.2811529189348221, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.93526791036129, + "step": 1654 + }, + { + "completion_length": 784.0580749511719, + "epoch": 0.4943618848480323, + "grad_norm": 1.9419023990631104, + "kl": 0.77490234375, + "learning_rate": 6.367546327964882e-07, + "loss": 0.047, + "reward": 1.0781250596046448, + "reward_std": 0.22413619700819254, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.9598214626312256, + "step": 1655 + }, + { + "completion_length": 853.5223541259766, + "epoch": 0.49466059293555376, + "grad_norm": 1.5953052043914795, + "kl": 0.7158203125, + "learning_rate": 6.362940292618989e-07, + "loss": 0.0469, + "reward": 1.0892857760190964, + "reward_std": 0.27344442903995514, + "rewards/accuracy_reward": 0.14508928824216127, + "rewards/format_reward": 0.9441964626312256, + "step": 1656 + }, + { + "completion_length": 905.091552734375, + "epoch": 0.4949593010230752, + "grad_norm": 3.1635901927948, + "kl": 0.75, + "learning_rate": 6.358333318478637e-07, + "loss": 0.0146, + "reward": 1.10714291036129, + "reward_std": 0.24699899554252625, + "rewards/accuracy_reward": 0.15625001164153218, + "rewards/format_reward": 0.9508928805589676, + "step": 1657 + }, + { + "completion_length": 767.4285888671875, + "epoch": 0.49525800911059664, + "grad_norm": 1.015831470489502, + "kl": 0.7685546875, + "learning_rate": 6.35372541055576e-07, + "loss": 0.0259, + "reward": 1.1517857611179352, + "reward_std": 0.2563938722014427, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9508928805589676, + "step": 1658 + }, + { + "completion_length": 897.2902221679688, + "epoch": 0.4955567171981181, + "grad_norm": 1.8977545499801636, + "kl": 0.970703125, + "learning_rate": 6.349116573863309e-07, + "loss": 0.0591, + "reward": 1.0334821790456772, + "reward_std": 0.284884799271822, + "rewards/accuracy_reward": 0.09151786286383867, + "rewards/format_reward": 0.941964328289032, + "step": 1659 + }, + { + "completion_length": 814.6495971679688, + "epoch": 0.4958554252856396, + "grad_norm": 1.0449068546295166, + "kl": 0.7177734375, + "learning_rate": 6.344506813415249e-07, + "loss": 0.0414, + "reward": 1.093750074505806, + "reward_std": 0.22372394800186157, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.957589328289032, + "step": 1660 + }, + { + "completion_length": 908.3013916015625, + "epoch": 0.49615413337316105, + "grad_norm": 0.7738348841667175, + "kl": 0.8115234375, + "learning_rate": 6.339896134226546e-07, + "loss": 0.0623, + "reward": 1.0580357611179352, + "reward_std": 0.21993083134293556, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.9464286118745804, + "step": 1661 + }, + { + "completion_length": 904.5335235595703, + "epoch": 0.4964528414606825, + "grad_norm": 5.494036674499512, + "kl": 1.19091796875, + "learning_rate": 6.335284541313168e-07, + "loss": 0.0573, + "reward": 1.0714286267757416, + "reward_std": 0.2475135736167431, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9441964775323868, + "step": 1662 + }, + { + "completion_length": 873.6362152099609, + "epoch": 0.496751549548204, + "grad_norm": 1.72533118724823, + "kl": 0.7109375, + "learning_rate": 6.330672039692077e-07, + "loss": 0.053, + "reward": 1.1049107909202576, + "reward_std": 0.2129802703857422, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.973214328289032, + "step": 1663 + }, + { + "completion_length": 926.7545166015625, + "epoch": 0.49705025763572547, + "grad_norm": 1.8580231666564941, + "kl": 1.068359375, + "learning_rate": 6.326058634381219e-07, + "loss": 0.0185, + "reward": 1.0267857313156128, + "reward_std": 0.27205023542046547, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.9375000447034836, + "step": 1664 + }, + { + "completion_length": 946.7366638183594, + "epoch": 0.49734896572324694, + "grad_norm": 2.0207479000091553, + "kl": 0.673828125, + "learning_rate": 6.321444330399531e-07, + "loss": 0.0614, + "reward": 1.0781250596046448, + "reward_std": 0.29384230822324753, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.941964328289032, + "step": 1665 + }, + { + "completion_length": 980.091552734375, + "epoch": 0.4976476738107684, + "grad_norm": 1.5205951929092407, + "kl": 0.65625, + "learning_rate": 6.316829132766921e-07, + "loss": 0.0705, + "reward": 1.084821492433548, + "reward_std": 0.29449404031038284, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9486607611179352, + "step": 1666 + }, + { + "completion_length": 915.0960083007812, + "epoch": 0.4979463818982899, + "grad_norm": 2.0617735385894775, + "kl": 0.7666015625, + "learning_rate": 6.312213046504273e-07, + "loss": 0.0791, + "reward": 1.1272321939468384, + "reward_std": 0.2937290407717228, + "rewards/accuracy_reward": 0.19196429196745157, + "rewards/format_reward": 0.9352678954601288, + "step": 1667 + }, + { + "completion_length": 901.7567291259766, + "epoch": 0.49824508998581135, + "grad_norm": 1.013335943222046, + "kl": 0.7607421875, + "learning_rate": 6.307596076633434e-07, + "loss": 0.0386, + "reward": 1.0089285969734192, + "reward_std": 0.28584547340869904, + "rewards/accuracy_reward": 0.07589286286383867, + "rewards/format_reward": 0.9330357611179352, + "step": 1668 + }, + { + "completion_length": 868.4062805175781, + "epoch": 0.4985437980733328, + "grad_norm": 2.472205638885498, + "kl": 0.55859375, + "learning_rate": 6.302978228177221e-07, + "loss": 0.0477, + "reward": 1.0357143580913544, + "reward_std": 0.28170109540224075, + "rewards/accuracy_reward": 0.10491071734577417, + "rewards/format_reward": 0.9308036118745804, + "step": 1669 + }, + { + "completion_length": 838.6607666015625, + "epoch": 0.4988425061608543, + "grad_norm": 0.8320043683052063, + "kl": 0.67578125, + "learning_rate": 6.298359506159392e-07, + "loss": 0.0408, + "reward": 1.066964328289032, + "reward_std": 0.22088447958230972, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9464286118745804, + "step": 1670 + }, + { + "completion_length": 865.3281707763672, + "epoch": 0.49914121424837576, + "grad_norm": 4.125886917114258, + "kl": 0.8388671875, + "learning_rate": 6.293739915604668e-07, + "loss": 0.0136, + "reward": 1.053571492433548, + "reward_std": 0.2348710559308529, + "rewards/accuracy_reward": 0.0915178582072258, + "rewards/format_reward": 0.9620536118745804, + "step": 1671 + }, + { + "completion_length": 880.6295318603516, + "epoch": 0.49943992233589724, + "grad_norm": 2.089329481124878, + "kl": 0.61083984375, + "learning_rate": 6.289119461538712e-07, + "loss": 0.0624, + "reward": 1.066964328289032, + "reward_std": 0.29562390223145485, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.941964328289032, + "step": 1672 + }, + { + "completion_length": 889.6049499511719, + "epoch": 0.4997386304234187, + "grad_norm": 3.1235978603363037, + "kl": 0.83984375, + "learning_rate": 6.284498148988123e-07, + "loss": 0.0378, + "reward": 1.0558036267757416, + "reward_std": 0.15974864177405834, + "rewards/accuracy_reward": 0.09151786239817739, + "rewards/format_reward": 0.9642857611179352, + "step": 1673 + }, + { + "completion_length": 858.3393096923828, + "epoch": 0.5000373385109402, + "grad_norm": 1.0893720388412476, + "kl": 0.6669921875, + "learning_rate": 6.279875982980439e-07, + "loss": 0.062, + "reward": 1.1272322237491608, + "reward_std": 0.25879086181521416, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9419643431901932, + "step": 1674 + }, + { + "completion_length": 900.3080749511719, + "epoch": 0.5003360465984616, + "grad_norm": 2.9381134510040283, + "kl": 0.68359375, + "learning_rate": 6.275252968544119e-07, + "loss": 0.0601, + "reward": 1.073660746216774, + "reward_std": 0.34112562984228134, + "rewards/accuracy_reward": 0.14955358020961285, + "rewards/format_reward": 0.9241071790456772, + "step": 1675 + }, + { + "completion_length": 850.5268249511719, + "epoch": 0.5006347546859832, + "grad_norm": 1.7672570943832397, + "kl": 0.70703125, + "learning_rate": 6.270629110708554e-07, + "loss": 0.0286, + "reward": 1.0825893133878708, + "reward_std": 0.21277425438165665, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9531250447034836, + "step": 1676 + }, + { + "completion_length": 764.0580749511719, + "epoch": 0.5009334627735046, + "grad_norm": 1.1036205291748047, + "kl": 0.59521484375, + "learning_rate": 6.266004414504044e-07, + "loss": 0.0333, + "reward": 1.1272322237491608, + "reward_std": 0.2704494372010231, + "rewards/accuracy_reward": 0.16741072665899992, + "rewards/format_reward": 0.9598214775323868, + "step": 1677 + }, + { + "completion_length": 894.5603179931641, + "epoch": 0.5012321708610261, + "grad_norm": 5.929014205932617, + "kl": 0.5830078125, + "learning_rate": 6.261378884961811e-07, + "loss": 0.0394, + "reward": 1.0736607611179352, + "reward_std": 0.20153237134218216, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.9508928954601288, + "step": 1678 + }, + { + "completion_length": 781.3504791259766, + "epoch": 0.5015308789485475, + "grad_norm": 0.7944934368133545, + "kl": 0.484375, + "learning_rate": 6.256752527113973e-07, + "loss": 0.0324, + "reward": 1.0401785969734192, + "reward_std": 0.1653239093720913, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.9709821790456772, + "step": 1679 + }, + { + "completion_length": 915.2433624267578, + "epoch": 0.501829587036069, + "grad_norm": 1.67527174949646, + "kl": 0.7060546875, + "learning_rate": 6.252125345993555e-07, + "loss": 0.0236, + "reward": 1.1294643580913544, + "reward_std": 0.3047637827694416, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.95089291036129, + "step": 1680 + }, + { + "completion_length": 891.107177734375, + "epoch": 0.5021282951235905, + "grad_norm": 1.7808741331100464, + "kl": 1.0234375, + "learning_rate": 6.247497346634475e-07, + "loss": 0.0579, + "reward": 1.1473214626312256, + "reward_std": 0.256815779954195, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.9486607611179352, + "step": 1681 + }, + { + "completion_length": 861.8393249511719, + "epoch": 0.5024270032111119, + "grad_norm": 1.3623363971710205, + "kl": 0.8701171875, + "learning_rate": 6.242868534071547e-07, + "loss": 0.0836, + "reward": 1.0848214626312256, + "reward_std": 0.27072348445653915, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.95089291036129, + "step": 1682 + }, + { + "completion_length": 847.3125457763672, + "epoch": 0.5027257112986334, + "grad_norm": 4.361104488372803, + "kl": 0.67919921875, + "learning_rate": 6.238238913340461e-07, + "loss": 0.056, + "reward": 1.1227679252624512, + "reward_std": 0.2998109385371208, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.941964328289032, + "step": 1683 + }, + { + "completion_length": 857.9174499511719, + "epoch": 0.5030244193861548, + "grad_norm": 4.856330871582031, + "kl": 0.8857421875, + "learning_rate": 6.233608489477793e-07, + "loss": 0.0641, + "reward": 1.1852679252624512, + "reward_std": 0.23963122069835663, + "rewards/accuracy_reward": 0.2299107275903225, + "rewards/format_reward": 0.9553571790456772, + "step": 1684 + }, + { + "completion_length": 849.9107360839844, + "epoch": 0.5033231274736764, + "grad_norm": 14.140524864196777, + "kl": 1.134765625, + "learning_rate": 6.228977267520991e-07, + "loss": 0.0804, + "reward": 1.0647321939468384, + "reward_std": 0.2620682157576084, + "rewards/accuracy_reward": 0.12053571920841932, + "rewards/format_reward": 0.9441964626312256, + "step": 1685 + }, + { + "completion_length": 806.3616333007812, + "epoch": 0.5036218355611978, + "grad_norm": 7.564581871032715, + "kl": 0.6171875, + "learning_rate": 6.224345252508368e-07, + "loss": 0.0469, + "reward": 1.1093750596046448, + "reward_std": 0.20283968932926655, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9687500447034836, + "step": 1686 + }, + { + "completion_length": 906.0469055175781, + "epoch": 0.5039205436487193, + "grad_norm": 1.7499284744262695, + "kl": 0.6533203125, + "learning_rate": 6.219712449479105e-07, + "loss": 0.0808, + "reward": 1.0334821790456772, + "reward_std": 0.25708237290382385, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9441964775323868, + "step": 1687 + }, + { + "completion_length": 740.3370971679688, + "epoch": 0.5042192517362407, + "grad_norm": 1.0043660402297974, + "kl": 0.58544921875, + "learning_rate": 6.215078863473234e-07, + "loss": 0.0235, + "reward": 1.1093750298023224, + "reward_std": 0.20950070396065712, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.9553571790456772, + "step": 1688 + }, + { + "completion_length": 817.3236999511719, + "epoch": 0.5045179598237622, + "grad_norm": 2.2838847637176514, + "kl": 0.64892578125, + "learning_rate": 6.210444499531647e-07, + "loss": 0.0363, + "reward": 1.0781250596046448, + "reward_std": 0.2366243079304695, + "rewards/accuracy_reward": 0.13616071874275804, + "rewards/format_reward": 0.941964328289032, + "step": 1689 + }, + { + "completion_length": 797.4174499511719, + "epoch": 0.5048166679112837, + "grad_norm": 3.3412506580352783, + "kl": 0.5966796875, + "learning_rate": 6.205809362696076e-07, + "loss": 0.0379, + "reward": 1.098214328289032, + "reward_std": 0.23633117228746414, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9575893431901932, + "step": 1690 + }, + { + "completion_length": 916.9844207763672, + "epoch": 0.5051153759988052, + "grad_norm": 2.164735794067383, + "kl": 0.59814453125, + "learning_rate": 6.201173458009093e-07, + "loss": 0.0372, + "reward": 1.0736607611179352, + "reward_std": 0.21972136944532394, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.9375000447034836, + "step": 1691 + }, + { + "completion_length": 867.4665679931641, + "epoch": 0.5054140840863266, + "grad_norm": 2.1622390747070312, + "kl": 0.974609375, + "learning_rate": 6.196536790514112e-07, + "loss": 0.045, + "reward": 1.0937500894069672, + "reward_std": 0.26021507382392883, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.9553571790456772, + "step": 1692 + }, + { + "completion_length": 921.0000457763672, + "epoch": 0.5057127921738481, + "grad_norm": 1.090827226638794, + "kl": 0.7744140625, + "learning_rate": 6.19189936525537e-07, + "loss": 0.0088, + "reward": 1.031250074505806, + "reward_std": 0.20685024000704288, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.9620536118745804, + "step": 1693 + }, + { + "completion_length": 836.4911041259766, + "epoch": 0.5060115002613695, + "grad_norm": 4.764147758483887, + "kl": 0.91259765625, + "learning_rate": 6.187261187277931e-07, + "loss": 0.0609, + "reward": 1.2142857611179352, + "reward_std": 0.28267380595207214, + "rewards/accuracy_reward": 0.2656250149011612, + "rewards/format_reward": 0.948660746216774, + "step": 1694 + }, + { + "completion_length": 852.7433319091797, + "epoch": 0.5063102083488911, + "grad_norm": 5.561146259307861, + "kl": 1.2021484375, + "learning_rate": 6.18262226162768e-07, + "loss": 0.0762, + "reward": 1.1316964626312256, + "reward_std": 0.2809307500720024, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9508928805589676, + "step": 1695 + }, + { + "completion_length": 860.7455749511719, + "epoch": 0.5066089164364125, + "grad_norm": 1.0545357465744019, + "kl": 0.7060546875, + "learning_rate": 6.177982593351313e-07, + "loss": 0.0352, + "reward": 1.0937500596046448, + "reward_std": 0.21027380973100662, + "rewards/accuracy_reward": 0.1294642868451774, + "rewards/format_reward": 0.9642857611179352, + "step": 1696 + }, + { + "completion_length": 877.6451263427734, + "epoch": 0.506907624523934, + "grad_norm": 1.2878005504608154, + "kl": 0.80859375, + "learning_rate": 6.173342187496333e-07, + "loss": 0.0423, + "reward": 1.1406250596046448, + "reward_std": 0.29392214491963387, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.9375000447034836, + "step": 1697 + }, + { + "completion_length": 919.2076263427734, + "epoch": 0.5072063326114554, + "grad_norm": 2.708094835281372, + "kl": 0.9892578125, + "learning_rate": 6.168701049111048e-07, + "loss": 0.0676, + "reward": 1.0669643580913544, + "reward_std": 0.2842428646981716, + "rewards/accuracy_reward": 0.12946429033763707, + "rewards/format_reward": 0.9375000447034836, + "step": 1698 + }, + { + "completion_length": 840.9777221679688, + "epoch": 0.507505040698977, + "grad_norm": 1.0815415382385254, + "kl": 0.6865234375, + "learning_rate": 6.164059183244562e-07, + "loss": 0.0508, + "reward": 1.1250000298023224, + "reward_std": 0.31171802431344986, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.941964328289032, + "step": 1699 + }, + { + "completion_length": 869.7009124755859, + "epoch": 0.5078037487864984, + "grad_norm": 1.6919481754302979, + "kl": 0.7373046875, + "learning_rate": 6.159416594946769e-07, + "loss": 0.0219, + "reward": 1.0446429252624512, + "reward_std": 0.22719286009669304, + "rewards/accuracy_reward": 0.08928571688011289, + "rewards/format_reward": 0.9553571790456772, + "step": 1700 + }, + { + "completion_length": 853.3616485595703, + "epoch": 0.5081024568740199, + "grad_norm": 2.7223803997039795, + "kl": 0.998046875, + "learning_rate": 6.15477328926835e-07, + "loss": 0.0381, + "reward": 1.1741071939468384, + "reward_std": 0.2555311322212219, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9419643133878708, + "step": 1701 + }, + { + "completion_length": 811.3036193847656, + "epoch": 0.5084011649615413, + "grad_norm": 2.89306902885437, + "kl": 0.78955078125, + "learning_rate": 6.150129271260768e-07, + "loss": 0.061, + "reward": 1.0982143580913544, + "reward_std": 0.24873012490570545, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9531250447034836, + "step": 1702 + }, + { + "completion_length": 890.7388763427734, + "epoch": 0.5086998730490628, + "grad_norm": 2.3135874271392822, + "kl": 0.7998046875, + "learning_rate": 6.145484545976257e-07, + "loss": 0.0297, + "reward": 1.1093750596046448, + "reward_std": 0.21459878608584404, + "rewards/accuracy_reward": 0.16294643748551607, + "rewards/format_reward": 0.9464286267757416, + "step": 1703 + }, + { + "completion_length": 821.9486999511719, + "epoch": 0.5089985811365842, + "grad_norm": 2.8620383739471436, + "kl": 0.69970703125, + "learning_rate": 6.140839118467825e-07, + "loss": 0.0654, + "reward": 1.0401785969734192, + "reward_std": 0.2873000204563141, + "rewards/accuracy_reward": 0.10491071874275804, + "rewards/format_reward": 0.9352678954601288, + "step": 1704 + }, + { + "completion_length": 984.9174499511719, + "epoch": 0.5092972892241058, + "grad_norm": 2.6460328102111816, + "kl": 0.7724609375, + "learning_rate": 6.13619299378924e-07, + "loss": 0.0685, + "reward": 1.0669642984867096, + "reward_std": 0.251096922904253, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9397321790456772, + "step": 1705 + }, + { + "completion_length": 860.4621124267578, + "epoch": 0.5095959973116272, + "grad_norm": 6.460272789001465, + "kl": 0.939453125, + "learning_rate": 6.131546176995033e-07, + "loss": 0.0598, + "reward": 1.0558036267757416, + "reward_std": 0.2594204246997833, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.9531250298023224, + "step": 1706 + }, + { + "completion_length": 952.7656707763672, + "epoch": 0.5098947053991487, + "grad_norm": 7.826502799987793, + "kl": 1.1064453125, + "learning_rate": 6.126898673140483e-07, + "loss": 0.0431, + "reward": 1.0535714477300644, + "reward_std": 0.24922434985637665, + "rewards/accuracy_reward": 0.10267857927829027, + "rewards/format_reward": 0.9508928954601288, + "step": 1707 + }, + { + "completion_length": 943.3080902099609, + "epoch": 0.5101934134866701, + "grad_norm": 16.789560317993164, + "kl": 1.34765625, + "learning_rate": 6.122250487281621e-07, + "loss": 0.0658, + "reward": 1.1026786267757416, + "reward_std": 0.1987937055528164, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.9642857760190964, + "step": 1708 + }, + { + "completion_length": 871.3370971679688, + "epoch": 0.5104921215741917, + "grad_norm": 5.69761323928833, + "kl": 1.12939453125, + "learning_rate": 6.117601624475214e-07, + "loss": 0.0475, + "reward": 1.1406250596046448, + "reward_std": 0.326728880405426, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9285714626312256, + "step": 1709 + }, + { + "completion_length": 849.8795013427734, + "epoch": 0.5107908296617131, + "grad_norm": 1.4485299587249756, + "kl": 0.7705078125, + "learning_rate": 6.11295208977877e-07, + "loss": 0.0131, + "reward": 1.1852679252624512, + "reward_std": 0.2493140660226345, + "rewards/accuracy_reward": 0.2343750149011612, + "rewards/format_reward": 0.9508928954601288, + "step": 1710 + }, + { + "completion_length": 840.9330749511719, + "epoch": 0.5110895377492346, + "grad_norm": 1.0065926313400269, + "kl": 0.6484375, + "learning_rate": 6.10830188825053e-07, + "loss": 0.0375, + "reward": 1.2254464626312256, + "reward_std": 0.28129101544618607, + "rewards/accuracy_reward": 0.2589285932481289, + "rewards/format_reward": 0.96651791036129, + "step": 1711 + }, + { + "completion_length": 912.7857513427734, + "epoch": 0.511388245836756, + "grad_norm": 1.4217075109481812, + "kl": 0.80859375, + "learning_rate": 6.103651024949454e-07, + "loss": 0.0265, + "reward": 1.0892857313156128, + "reward_std": 0.2854500077664852, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9285714626312256, + "step": 1712 + }, + { + "completion_length": 873.3303985595703, + "epoch": 0.5116869539242775, + "grad_norm": 2.1550886631011963, + "kl": 0.44140625, + "learning_rate": 6.098999504935228e-07, + "loss": 0.0298, + "reward": 1.0848214626312256, + "reward_std": 0.21389195509254932, + "rewards/accuracy_reward": 0.12500000302679837, + "rewards/format_reward": 0.9598214626312256, + "step": 1713 + }, + { + "completion_length": 927.5692291259766, + "epoch": 0.511985662011799, + "grad_norm": 1.8727043867111206, + "kl": 0.3740234375, + "learning_rate": 6.094347333268251e-07, + "loss": 0.0712, + "reward": 1.1540178954601288, + "reward_std": 0.21597838774323463, + "rewards/accuracy_reward": 0.17857144307345152, + "rewards/format_reward": 0.9754464626312256, + "step": 1714 + }, + { + "completion_length": 931.5893096923828, + "epoch": 0.5122843700993205, + "grad_norm": 2.2006025314331055, + "kl": 0.4619140625, + "learning_rate": 6.089694515009624e-07, + "loss": 0.0496, + "reward": 1.1540179252624512, + "reward_std": 0.25072114914655685, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9642857611179352, + "step": 1715 + }, + { + "completion_length": 870.0000457763672, + "epoch": 0.5125830781868419, + "grad_norm": 1.8346506357192993, + "kl": 0.48388671875, + "learning_rate": 6.085041055221161e-07, + "loss": 0.0628, + "reward": 1.116071492433548, + "reward_std": 0.3047318682074547, + "rewards/accuracy_reward": 0.17857143841683865, + "rewards/format_reward": 0.9375000447034836, + "step": 1716 + }, + { + "completion_length": 917.0111999511719, + "epoch": 0.5128817862743634, + "grad_norm": 2.3750481605529785, + "kl": 0.6513671875, + "learning_rate": 6.080386958965374e-07, + "loss": 0.0223, + "reward": 1.1339286267757416, + "reward_std": 0.24327952414751053, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9486607611179352, + "step": 1717 + }, + { + "completion_length": 964.6183319091797, + "epoch": 0.5131804943618848, + "grad_norm": 0.8145914673805237, + "kl": 0.5068359375, + "learning_rate": 6.075732231305457e-07, + "loss": 0.0363, + "reward": 1.0848214775323868, + "reward_std": 0.22964514791965485, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9464286118745804, + "step": 1718 + }, + { + "completion_length": 864.0536193847656, + "epoch": 0.5134792024494064, + "grad_norm": 0.953963577747345, + "kl": 0.666748046875, + "learning_rate": 6.071076877305299e-07, + "loss": 0.0239, + "reward": 1.1361607909202576, + "reward_std": 0.23021133989095688, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.957589328289032, + "step": 1719 + }, + { + "completion_length": 899.6830749511719, + "epoch": 0.5137779105369278, + "grad_norm": 1.8186429738998413, + "kl": 0.64404296875, + "learning_rate": 6.066420902029472e-07, + "loss": 0.035, + "reward": 1.0892857611179352, + "reward_std": 0.2772235535085201, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.9531250447034836, + "step": 1720 + }, + { + "completion_length": 943.3058471679688, + "epoch": 0.5140766186244493, + "grad_norm": 1.182373285293579, + "kl": 0.982421875, + "learning_rate": 6.061764310543219e-07, + "loss": 0.0105, + "reward": 1.0156250447034836, + "reward_std": 0.25376710668206215, + "rewards/accuracy_reward": 0.08705357322469354, + "rewards/format_reward": 0.9285714775323868, + "step": 1721 + }, + { + "completion_length": 936.6964721679688, + "epoch": 0.5143753267119707, + "grad_norm": 1.350132703781128, + "kl": 0.89990234375, + "learning_rate": 6.057107107912453e-07, + "loss": 0.0433, + "reward": 1.0714286267757416, + "reward_std": 0.27406859770417213, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9419643133878708, + "step": 1722 + }, + { + "completion_length": 928.513427734375, + "epoch": 0.5146740347994921, + "grad_norm": 2.3604440689086914, + "kl": 1.283203125, + "learning_rate": 6.052449299203758e-07, + "loss": 0.0187, + "reward": 1.0714285969734192, + "reward_std": 0.25615815445780754, + "rewards/accuracy_reward": 0.14062500605359674, + "rewards/format_reward": 0.9308036118745804, + "step": 1723 + }, + { + "completion_length": 974.9107513427734, + "epoch": 0.5149727428870137, + "grad_norm": 2.433929920196533, + "kl": 0.88671875, + "learning_rate": 6.047790889484369e-07, + "loss": 0.0241, + "reward": 1.1897321939468384, + "reward_std": 0.278184499591589, + "rewards/accuracy_reward": 0.2433035783469677, + "rewards/format_reward": 0.9464286267757416, + "step": 1724 + }, + { + "completion_length": 947.0826416015625, + "epoch": 0.5152714509745351, + "grad_norm": 2.841384172439575, + "kl": 1.005859375, + "learning_rate": 6.043131883822185e-07, + "loss": 0.0456, + "reward": 1.0580357313156128, + "reward_std": 0.20904503017663956, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.9598214775323868, + "step": 1725 + }, + { + "completion_length": 963.9330902099609, + "epoch": 0.5155701590620566, + "grad_norm": 3.5189425945281982, + "kl": 1.236328125, + "learning_rate": 6.038472287285741e-07, + "loss": 0.0566, + "reward": 1.0758928954601288, + "reward_std": 0.25643105432391167, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9531250298023224, + "step": 1726 + }, + { + "completion_length": 985.8147583007812, + "epoch": 0.515868867149578, + "grad_norm": 1.4155867099761963, + "kl": 0.892578125, + "learning_rate": 6.033812104944227e-07, + "loss": 0.0051, + "reward": 1.0401786118745804, + "reward_std": 0.19728046655654907, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.957589328289032, + "step": 1727 + }, + { + "completion_length": 982.6607513427734, + "epoch": 0.5161675752370996, + "grad_norm": 0.8324518799781799, + "kl": 0.59130859375, + "learning_rate": 6.02915134186746e-07, + "loss": 0.0075, + "reward": 1.2075892984867096, + "reward_std": 0.238079734146595, + "rewards/accuracy_reward": 0.243303582072258, + "rewards/format_reward": 0.9642857760190964, + "step": 1728 + }, + { + "completion_length": 960.2924652099609, + "epoch": 0.516466283324621, + "grad_norm": 2.2280256748199463, + "kl": 0.7109375, + "learning_rate": 6.024490003125896e-07, + "loss": 0.0133, + "reward": 1.051339328289032, + "reward_std": 0.2510583624243736, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.9330357611179352, + "step": 1729 + }, + { + "completion_length": 895.2411041259766, + "epoch": 0.5167649914121425, + "grad_norm": 1.7914071083068848, + "kl": 0.6875, + "learning_rate": 6.019828093790613e-07, + "loss": -0.0017, + "reward": 1.2321428954601288, + "reward_std": 0.25837991014122963, + "rewards/accuracy_reward": 0.28571430407464504, + "rewards/format_reward": 0.9464285969734192, + "step": 1730 + }, + { + "completion_length": 989.7255096435547, + "epoch": 0.5170636994996639, + "grad_norm": 2.9603872299194336, + "kl": 0.70458984375, + "learning_rate": 6.015165618933315e-07, + "loss": -0.001, + "reward": 1.0915178954601288, + "reward_std": 0.29240186139941216, + "rewards/accuracy_reward": 0.14732143096625805, + "rewards/format_reward": 0.9441964775323868, + "step": 1731 + }, + { + "completion_length": 991.7991638183594, + "epoch": 0.5173624075871854, + "grad_norm": 1.0640382766723633, + "kl": 0.65087890625, + "learning_rate": 6.010502583626314e-07, + "loss": 0.049, + "reward": 1.1049107909202576, + "reward_std": 0.2596750780940056, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9575893133878708, + "step": 1732 + }, + { + "completion_length": 994.9687957763672, + "epoch": 0.5176611156747069, + "grad_norm": 1.0573147535324097, + "kl": 0.595703125, + "learning_rate": 6.005838992942536e-07, + "loss": 0.0635, + "reward": 1.145089328289032, + "reward_std": 0.2546224743127823, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9575893133878708, + "step": 1733 + }, + { + "completion_length": 988.5312805175781, + "epoch": 0.5179598237622284, + "grad_norm": 1.2879401445388794, + "kl": 0.6748046875, + "learning_rate": 6.001174851955512e-07, + "loss": 0.0153, + "reward": 1.1517857909202576, + "reward_std": 0.23239846527576447, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.9464286118745804, + "step": 1734 + }, + { + "completion_length": 975.9330902099609, + "epoch": 0.5182585318497498, + "grad_norm": 0.9814623594284058, + "kl": 0.8486328125, + "learning_rate": 5.99651016573937e-07, + "loss": 0.0263, + "reward": 1.0401786267757416, + "reward_std": 0.21332362666726112, + "rewards/accuracy_reward": 0.09375000349245965, + "rewards/format_reward": 0.9464286118745804, + "step": 1735 + }, + { + "completion_length": 966.8014068603516, + "epoch": 0.5185572399372713, + "grad_norm": 1.2816224098205566, + "kl": 1.1318359375, + "learning_rate": 5.99184493936883e-07, + "loss": 0.019, + "reward": 1.1093750596046448, + "reward_std": 0.362427294254303, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.926339328289032, + "step": 1736 + }, + { + "completion_length": 986.310302734375, + "epoch": 0.5188559480247927, + "grad_norm": 1.2681858539581299, + "kl": 0.9248046875, + "learning_rate": 5.987179177919202e-07, + "loss": 0.0643, + "reward": 1.004464328289032, + "reward_std": 0.2656087428331375, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.9308036118745804, + "step": 1737 + }, + { + "completion_length": 974.5781707763672, + "epoch": 0.5191546561123143, + "grad_norm": 1.8891994953155518, + "kl": 1.0166015625, + "learning_rate": 5.982512886466377e-07, + "loss": -0.013, + "reward": 1.0000000298023224, + "reward_std": 0.30107778683304787, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.9241071790456772, + "step": 1738 + }, + { + "completion_length": 971.2902221679688, + "epoch": 0.5194533641998357, + "grad_norm": 1.851032018661499, + "kl": 1.07861328125, + "learning_rate": 5.977846070086823e-07, + "loss": 0.0003, + "reward": 1.0647321939468384, + "reward_std": 0.26765621826052666, + "rewards/accuracy_reward": 0.12946429196745157, + "rewards/format_reward": 0.93526791036129, + "step": 1739 + }, + { + "completion_length": 915.2389068603516, + "epoch": 0.5197520722873572, + "grad_norm": 2.314688205718994, + "kl": 0.8408203125, + "learning_rate": 5.973178733857578e-07, + "loss": 0.0565, + "reward": 1.1406250596046448, + "reward_std": 0.20081467926502228, + "rewards/accuracy_reward": 0.17187500861473382, + "rewards/format_reward": 0.9687500298023224, + "step": 1740 + }, + { + "completion_length": 893.6540679931641, + "epoch": 0.5200507803748786, + "grad_norm": 1.4645804166793823, + "kl": 0.693359375, + "learning_rate": 5.968510882856249e-07, + "loss": 0.0518, + "reward": 1.1138393580913544, + "reward_std": 0.21152473613619804, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.95089291036129, + "step": 1741 + }, + { + "completion_length": 925.1228179931641, + "epoch": 0.5203494884624001, + "grad_norm": 2.8041329383850098, + "kl": 0.6708984375, + "learning_rate": 5.963842522160997e-07, + "loss": 0.0341, + "reward": 1.0758928954601288, + "reward_std": 0.2758454754948616, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9486607611179352, + "step": 1742 + }, + { + "completion_length": 951.1451416015625, + "epoch": 0.5206481965499216, + "grad_norm": 1.7179235219955444, + "kl": 0.779296875, + "learning_rate": 5.959173656850543e-07, + "loss": 0.0055, + "reward": 0.979910746216774, + "reward_std": 0.24566993117332458, + "rewards/accuracy_reward": 0.04910714668221772, + "rewards/format_reward": 0.9308035969734192, + "step": 1743 + }, + { + "completion_length": 984.0245819091797, + "epoch": 0.5209469046374431, + "grad_norm": 2.2634267807006836, + "kl": 0.615234375, + "learning_rate": 5.954504292004154e-07, + "loss": 0.0064, + "reward": 1.064732164144516, + "reward_std": 0.2196192927658558, + "rewards/accuracy_reward": 0.10044643492437899, + "rewards/format_reward": 0.964285746216774, + "step": 1744 + }, + { + "completion_length": 1000.9799346923828, + "epoch": 0.5212456127249645, + "grad_norm": 2.5119059085845947, + "kl": 0.6865234375, + "learning_rate": 5.949834432701641e-07, + "loss": 0.0298, + "reward": 1.1049107760190964, + "reward_std": 0.26609769836068153, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.9486607313156128, + "step": 1745 + }, + { + "completion_length": 891.6897583007812, + "epoch": 0.521544320812486, + "grad_norm": 1.156991720199585, + "kl": 0.6904296875, + "learning_rate": 5.945164084023355e-07, + "loss": 0.0413, + "reward": 1.1696429401636124, + "reward_std": 0.25830913707613945, + "rewards/accuracy_reward": 0.2232142996508628, + "rewards/format_reward": 0.9464286118745804, + "step": 1746 + }, + { + "completion_length": 963.1763610839844, + "epoch": 0.5218430289000074, + "grad_norm": 0.9596554040908813, + "kl": 0.552490234375, + "learning_rate": 5.940493251050174e-07, + "loss": -0.0064, + "reward": 1.0200893431901932, + "reward_std": 0.18192357942461967, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.9486607611179352, + "step": 1747 + }, + { + "completion_length": 1007.3951416015625, + "epoch": 0.522141736987529, + "grad_norm": 1.6662253141403198, + "kl": 0.6357421875, + "learning_rate": 5.93582193886351e-07, + "loss": 0.006, + "reward": 0.98214291036129, + "reward_std": 0.22009030729532242, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.9375000596046448, + "step": 1748 + }, + { + "completion_length": 930.6741333007812, + "epoch": 0.5224404450750504, + "grad_norm": 1.5979082584381104, + "kl": 0.5830078125, + "learning_rate": 5.931150152545292e-07, + "loss": 0.0595, + "reward": 1.0602678954601288, + "reward_std": 0.2643696404993534, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9464286118745804, + "step": 1749 + }, + { + "completion_length": 910.3304138183594, + "epoch": 0.5227391531625719, + "grad_norm": 1.7684975862503052, + "kl": 0.64404296875, + "learning_rate": 5.926477897177967e-07, + "loss": 0.0499, + "reward": 1.0468750596046448, + "reward_std": 0.2108190953731537, + "rewards/accuracy_reward": 0.08482143236324191, + "rewards/format_reward": 0.9620536118745804, + "step": 1750 + }, + { + "completion_length": 949.3036193847656, + "epoch": 0.5230378612500933, + "grad_norm": 1.125667691230774, + "kl": 0.712890625, + "learning_rate": 5.921805177844486e-07, + "loss": 0.0107, + "reward": 0.9709822088479996, + "reward_std": 0.21332277730107307, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.944196492433548, + "step": 1751 + }, + { + "completion_length": 1098.9576263427734, + "epoch": 0.5233365693376149, + "grad_norm": 23.117231369018555, + "kl": 2.9091796875, + "learning_rate": 5.917131999628315e-07, + "loss": 0.0562, + "reward": 1.1316964775323868, + "reward_std": 0.2669077403843403, + "rewards/accuracy_reward": 0.18750000861473382, + "rewards/format_reward": 0.9441964626312256, + "step": 1752 + }, + { + "completion_length": 965.7723846435547, + "epoch": 0.5236352774251363, + "grad_norm": 1.8291077613830566, + "kl": 0.7060546875, + "learning_rate": 5.912458367613409e-07, + "loss": 0.0321, + "reward": 1.098214328289032, + "reward_std": 0.305444173514843, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9375000447034836, + "step": 1753 + }, + { + "completion_length": 935.1339721679688, + "epoch": 0.5239339855126578, + "grad_norm": 0.7529056072235107, + "kl": 0.51708984375, + "learning_rate": 5.907784286884228e-07, + "loss": 0.0398, + "reward": 1.0781250596046448, + "reward_std": 0.2668227795511484, + "rewards/accuracy_reward": 0.1294642947614193, + "rewards/format_reward": 0.9486607760190964, + "step": 1754 + }, + { + "completion_length": 963.5045013427734, + "epoch": 0.5242326936001792, + "grad_norm": 1.1571073532104492, + "kl": 0.4072265625, + "learning_rate": 5.903109762525707e-07, + "loss": 0.037, + "reward": 1.1696428954601288, + "reward_std": 0.25752517953515053, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9687500447034836, + "step": 1755 + }, + { + "completion_length": 904.2991333007812, + "epoch": 0.5245314016877007, + "grad_norm": 1.2274895906448364, + "kl": 0.3623046875, + "learning_rate": 5.898434799623276e-07, + "loss": 0.0273, + "reward": 1.0491072088479996, + "reward_std": 0.2446090281009674, + "rewards/accuracy_reward": 0.10267857927829027, + "rewards/format_reward": 0.9464286267757416, + "step": 1756 + }, + { + "completion_length": 985.0179138183594, + "epoch": 0.5248301097752222, + "grad_norm": 1.4356715679168701, + "kl": 0.697265625, + "learning_rate": 5.893759403262832e-07, + "loss": 0.0246, + "reward": 1.0669643431901932, + "reward_std": 0.2652854770421982, + "rewards/accuracy_reward": 0.14285715157166123, + "rewards/format_reward": 0.9241071790456772, + "step": 1757 + }, + { + "completion_length": 971.0580749511719, + "epoch": 0.5251288178627437, + "grad_norm": 1.280410647392273, + "kl": 0.477294921875, + "learning_rate": 5.889083578530752e-07, + "loss": 0.0238, + "reward": 1.1584821939468384, + "reward_std": 0.1799035184085369, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.9776786118745804, + "step": 1758 + }, + { + "completion_length": 875.7946929931641, + "epoch": 0.5254275259502651, + "grad_norm": 1.6494965553283691, + "kl": 0.455078125, + "learning_rate": 5.88440733051387e-07, + "loss": 0.0436, + "reward": 1.131696492433548, + "reward_std": 0.24542837589979172, + "rewards/accuracy_reward": 0.17410715110599995, + "rewards/format_reward": 0.9575893431901932, + "step": 1759 + }, + { + "completion_length": 807.6630096435547, + "epoch": 0.5257262340377866, + "grad_norm": 2.458458662033081, + "kl": 0.54638671875, + "learning_rate": 5.87973066429949e-07, + "loss": 0.0015, + "reward": 1.1473214626312256, + "reward_std": 0.3064127303659916, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9598214626312256, + "step": 1760 + }, + { + "completion_length": 916.2969055175781, + "epoch": 0.526024942125308, + "grad_norm": 2.3966562747955322, + "kl": 0.51953125, + "learning_rate": 5.875053584975365e-07, + "loss": 0.0602, + "reward": 1.0803571939468384, + "reward_std": 0.25933191925287247, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9397321790456772, + "step": 1761 + }, + { + "completion_length": 878.935302734375, + "epoch": 0.5263236502128296, + "grad_norm": 1.0706305503845215, + "kl": 0.51123046875, + "learning_rate": 5.870376097629698e-07, + "loss": -0.0031, + "reward": 1.1562500596046448, + "reward_std": 0.2762843184173107, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.933035746216774, + "step": 1762 + }, + { + "completion_length": 961.7098693847656, + "epoch": 0.526622358300351, + "grad_norm": 1.5879932641983032, + "kl": 0.890625, + "learning_rate": 5.865698207351138e-07, + "loss": 0.049, + "reward": 1.0870536118745804, + "reward_std": 0.28707873448729515, + "rewards/accuracy_reward": 0.15625000465661287, + "rewards/format_reward": 0.9308035969734192, + "step": 1763 + }, + { + "completion_length": 867.4308319091797, + "epoch": 0.5269210663878725, + "grad_norm": 1.5983375310897827, + "kl": 0.908203125, + "learning_rate": 5.861019919228769e-07, + "loss": 0.0294, + "reward": 1.0602679252624512, + "reward_std": 0.23188291117548943, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.9553571939468384, + "step": 1764 + }, + { + "completion_length": 967.2031555175781, + "epoch": 0.5272197744753939, + "grad_norm": 2.4823052883148193, + "kl": 1.0947265625, + "learning_rate": 5.856341238352114e-07, + "loss": 0.0248, + "reward": 1.037946492433548, + "reward_std": 0.21401330828666687, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.948660746216774, + "step": 1765 + }, + { + "completion_length": 908.2455902099609, + "epoch": 0.5275184825629153, + "grad_norm": 1.8252514600753784, + "kl": 0.78515625, + "learning_rate": 5.851662169811116e-07, + "loss": -0.0186, + "reward": 1.0424107909202576, + "reward_std": 0.25850632041692734, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9375000596046448, + "step": 1766 + }, + { + "completion_length": 995.9531402587891, + "epoch": 0.5278171906504369, + "grad_norm": 2.1154093742370605, + "kl": 0.9716796875, + "learning_rate": 5.846982718696143e-07, + "loss": 0.0239, + "reward": 1.0959821939468384, + "reward_std": 0.23134801536798477, + "rewards/accuracy_reward": 0.14508929196745157, + "rewards/format_reward": 0.9508928954601288, + "step": 1767 + }, + { + "completion_length": 1035.729965209961, + "epoch": 0.5281158987379583, + "grad_norm": 15.90575885772705, + "kl": 1.34765625, + "learning_rate": 5.842302890097981e-07, + "loss": 0.0444, + "reward": 1.0558035969734192, + "reward_std": 0.23370064422488213, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.957589328289032, + "step": 1768 + }, + { + "completion_length": 1070.9152221679688, + "epoch": 0.5284146068254798, + "grad_norm": 1.039475679397583, + "kl": 0.60302734375, + "learning_rate": 5.837622689107823e-07, + "loss": 0.0437, + "reward": 1.0803571939468384, + "reward_std": 0.2401709482073784, + "rewards/accuracy_reward": 0.12500000838190317, + "rewards/format_reward": 0.9553571939468384, + "step": 1769 + }, + { + "completion_length": 942.0803985595703, + "epoch": 0.5287133149130012, + "grad_norm": 3.7478723526000977, + "kl": 0.8642578125, + "learning_rate": 5.83294212081727e-07, + "loss": 0.0384, + "reward": 1.1160714626312256, + "reward_std": 0.3057986944913864, + "rewards/accuracy_reward": 0.16964286379516125, + "rewards/format_reward": 0.9464286118745804, + "step": 1770 + }, + { + "completion_length": 942.8237152099609, + "epoch": 0.5290120230005227, + "grad_norm": 2.2152047157287598, + "kl": 0.63330078125, + "learning_rate": 5.828261190318323e-07, + "loss": 0.034, + "reward": 1.1227678954601288, + "reward_std": 0.19031227752566338, + "rewards/accuracy_reward": 0.1629464328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 1771 + }, + { + "completion_length": 937.0469360351562, + "epoch": 0.5293107310880442, + "grad_norm": 1.1060088872909546, + "kl": 0.515625, + "learning_rate": 5.823579902703373e-07, + "loss": -0.024, + "reward": 1.1406250447034836, + "reward_std": 0.22800015285611153, + "rewards/accuracy_reward": 0.18303572433069348, + "rewards/format_reward": 0.9575893431901932, + "step": 1772 + }, + { + "completion_length": 944.2388763427734, + "epoch": 0.5296094391755657, + "grad_norm": 1.0554438829421997, + "kl": 0.6552734375, + "learning_rate": 5.818898263065203e-07, + "loss": 0.0286, + "reward": 1.1986607909202576, + "reward_std": 0.26161758229136467, + "rewards/accuracy_reward": 0.23660715483129025, + "rewards/format_reward": 0.9620535969734192, + "step": 1773 + }, + { + "completion_length": 918.2902374267578, + "epoch": 0.5299081472630871, + "grad_norm": 1.2875337600708008, + "kl": 0.8505859375, + "learning_rate": 5.814216276496978e-07, + "loss": -0.0004, + "reward": 1.109375074505806, + "reward_std": 0.32558736205101013, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.941964328289032, + "step": 1774 + }, + { + "completion_length": 1014.8549346923828, + "epoch": 0.5302068553506086, + "grad_norm": 1.285003423690796, + "kl": 0.5498046875, + "learning_rate": 5.809533948092241e-07, + "loss": -0.0075, + "reward": 1.1049107611179352, + "reward_std": 0.25390948727726936, + "rewards/accuracy_reward": 0.14955358067527413, + "rewards/format_reward": 0.9553571790456772, + "step": 1775 + }, + { + "completion_length": 1081.2076568603516, + "epoch": 0.53050556343813, + "grad_norm": 2.9693636894226074, + "kl": 0.859375, + "learning_rate": 5.804851282944905e-07, + "loss": 0.0425, + "reward": 1.1428571939468384, + "reward_std": 0.22274867817759514, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9709821790456772, + "step": 1776 + }, + { + "completion_length": 957.2388916015625, + "epoch": 0.5308042715256516, + "grad_norm": 1.356405258178711, + "kl": 0.73046875, + "learning_rate": 5.800168286149254e-07, + "loss": 0.0194, + "reward": 1.2232143580913544, + "reward_std": 0.296751469373703, + "rewards/accuracy_reward": 0.2700892984867096, + "rewards/format_reward": 0.9531250298023224, + "step": 1777 + }, + { + "completion_length": 976.7567291259766, + "epoch": 0.531102979613173, + "grad_norm": 0.9274383187294006, + "kl": 0.77734375, + "learning_rate": 5.795484962799924e-07, + "loss": 0.0367, + "reward": 1.160714328289032, + "reward_std": 0.2734021805226803, + "rewards/accuracy_reward": 0.19866072246804833, + "rewards/format_reward": 0.9620535969734192, + "step": 1778 + }, + { + "completion_length": 997.8817443847656, + "epoch": 0.5314016877006945, + "grad_norm": 0.8650025129318237, + "kl": 0.734375, + "learning_rate": 5.790801317991919e-07, + "loss": 0.0199, + "reward": 1.1718750298023224, + "reward_std": 0.2672051191329956, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.9464286118745804, + "step": 1779 + }, + { + "completion_length": 1033.3170318603516, + "epoch": 0.5317003957882159, + "grad_norm": 1.418544888496399, + "kl": 0.625, + "learning_rate": 5.786117356820579e-07, + "loss": 0.0131, + "reward": 1.0803571790456772, + "reward_std": 0.253945242613554, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9531250447034836, + "step": 1780 + }, + { + "completion_length": 984.2902069091797, + "epoch": 0.5319991038757375, + "grad_norm": 0.8559054732322693, + "kl": 0.6767578125, + "learning_rate": 5.781433084381599e-07, + "loss": 0.0227, + "reward": 1.1093750596046448, + "reward_std": 0.28973061591386795, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.9464286267757416, + "step": 1781 + }, + { + "completion_length": 935.5380096435547, + "epoch": 0.5322978119632589, + "grad_norm": 0.9257299304008484, + "kl": 0.6484375, + "learning_rate": 5.776748505771005e-07, + "loss": 0.0077, + "reward": 1.2165178954601288, + "reward_std": 0.2788492515683174, + "rewards/accuracy_reward": 0.2633928656578064, + "rewards/format_reward": 0.9531250298023224, + "step": 1782 + }, + { + "completion_length": 988.0178985595703, + "epoch": 0.5325965200507804, + "grad_norm": 2.480884313583374, + "kl": 0.6875, + "learning_rate": 5.77206362608516e-07, + "loss": -0.0003, + "reward": 1.0647322088479996, + "reward_std": 0.2588692456483841, + "rewards/accuracy_reward": 0.12500000861473382, + "rewards/format_reward": 0.939732164144516, + "step": 1783 + }, + { + "completion_length": 1012.0580749511719, + "epoch": 0.5328952281383018, + "grad_norm": 0.9554315209388733, + "kl": 0.78076171875, + "learning_rate": 5.767378450420758e-07, + "loss": 0.024, + "reward": 1.06026791036129, + "reward_std": 0.26540490612387657, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.9441964775323868, + "step": 1784 + }, + { + "completion_length": 948.3058471679688, + "epoch": 0.5331939362258233, + "grad_norm": 1.4943337440490723, + "kl": 0.6953125, + "learning_rate": 5.762692983874806e-07, + "loss": 0.0316, + "reward": 1.3303571939468384, + "reward_std": 0.36386604607105255, + "rewards/accuracy_reward": 0.381696455180645, + "rewards/format_reward": 0.9486607611179352, + "step": 1785 + }, + { + "completion_length": 886.8281707763672, + "epoch": 0.5334926443133448, + "grad_norm": 0.481302946805954, + "kl": 0.47607421875, + "learning_rate": 5.758007231544636e-07, + "loss": 0.0152, + "reward": 1.095982164144516, + "reward_std": 0.18093354254961014, + "rewards/accuracy_reward": 0.13169643376022577, + "rewards/format_reward": 0.9642857611179352, + "step": 1786 + }, + { + "completion_length": 889.2745819091797, + "epoch": 0.5337913524008663, + "grad_norm": 1.0481775999069214, + "kl": 0.58251953125, + "learning_rate": 5.753321198527883e-07, + "loss": 0.0269, + "reward": 1.1718750596046448, + "reward_std": 0.1845012092962861, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.9687500298023224, + "step": 1787 + }, + { + "completion_length": 1066.8861999511719, + "epoch": 0.5340900604883877, + "grad_norm": 1.245798945426941, + "kl": 0.471923828125, + "learning_rate": 5.748634889922494e-07, + "loss": -0.0132, + "reward": 1.129464328289032, + "reward_std": 0.2880612462759018, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.93526791036129, + "step": 1788 + }, + { + "completion_length": 913.7210235595703, + "epoch": 0.5343887685759092, + "grad_norm": 0.9491491913795471, + "kl": 0.56005859375, + "learning_rate": 5.743948310826716e-07, + "loss": -0.0078, + "reward": 1.0870536267757416, + "reward_std": 0.20499612018465996, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.95089291036129, + "step": 1789 + }, + { + "completion_length": 983.8303985595703, + "epoch": 0.5346874766634306, + "grad_norm": 1.1409772634506226, + "kl": 0.421875, + "learning_rate": 5.739261466339083e-07, + "loss": 0.0175, + "reward": 1.1272321939468384, + "reward_std": 0.21645399183034897, + "rewards/accuracy_reward": 0.17410714784637094, + "rewards/format_reward": 0.9531250447034836, + "step": 1790 + }, + { + "completion_length": 925.5201416015625, + "epoch": 0.5349861847509522, + "grad_norm": 1.4984846115112305, + "kl": 0.38427734375, + "learning_rate": 5.734574361558427e-07, + "loss": 0.0008, + "reward": 1.1250000596046448, + "reward_std": 0.2728953883051872, + "rewards/accuracy_reward": 0.17410715110599995, + "rewards/format_reward": 0.9508928805589676, + "step": 1791 + }, + { + "completion_length": 879.9062805175781, + "epoch": 0.5352848928384736, + "grad_norm": 1.3025548458099365, + "kl": 0.3876953125, + "learning_rate": 5.729887001583857e-07, + "loss": 0.0506, + "reward": 1.1450893580913544, + "reward_std": 0.23327889293432236, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9508928954601288, + "step": 1792 + }, + { + "completion_length": 902.4486999511719, + "epoch": 0.5355836009259951, + "grad_norm": 1.2987232208251953, + "kl": 0.4794921875, + "learning_rate": 5.725199391514757e-07, + "loss": 0.0461, + "reward": 1.1830357611179352, + "reward_std": 0.22141234204173088, + "rewards/accuracy_reward": 0.21875000931322575, + "rewards/format_reward": 0.9642857760190964, + "step": 1793 + }, + { + "completion_length": 885.3594207763672, + "epoch": 0.5358823090135165, + "grad_norm": 1.1200435161590576, + "kl": 0.44189453125, + "learning_rate": 5.720511536450793e-07, + "loss": 0.0115, + "reward": 1.049107164144516, + "reward_std": 0.20247988030314445, + "rewards/accuracy_reward": 0.07366071920841932, + "rewards/format_reward": 0.975446492433548, + "step": 1794 + }, + { + "completion_length": 975.6964721679688, + "epoch": 0.536181017101038, + "grad_norm": 3.9705991744995117, + "kl": 0.7236328125, + "learning_rate": 5.715823441491889e-07, + "loss": 0.0446, + "reward": 1.2187500596046448, + "reward_std": 0.35093751549720764, + "rewards/accuracy_reward": 0.29241072200238705, + "rewards/format_reward": 0.926339328289032, + "step": 1795 + }, + { + "completion_length": 945.5134429931641, + "epoch": 0.5364797251885595, + "grad_norm": 1.0042123794555664, + "kl": 0.439453125, + "learning_rate": 5.711135111738236e-07, + "loss": 0.0211, + "reward": 1.1450893580913544, + "reward_std": 0.304103285074234, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 1796 + }, + { + "completion_length": 943.6585388183594, + "epoch": 0.536778433276081, + "grad_norm": 1.299076795578003, + "kl": 0.57470703125, + "learning_rate": 5.706446552290272e-07, + "loss": 0.0502, + "reward": 1.178571492433548, + "reward_std": 0.3172904774546623, + "rewards/accuracy_reward": 0.2276785857975483, + "rewards/format_reward": 0.9508928954601288, + "step": 1797 + }, + { + "completion_length": 908.6786041259766, + "epoch": 0.5370771413636024, + "grad_norm": 1.7653553485870361, + "kl": 0.6337890625, + "learning_rate": 5.701757768248693e-07, + "loss": 0.0036, + "reward": 1.0848214328289032, + "reward_std": 0.2347123697400093, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.95089291036129, + "step": 1798 + }, + { + "completion_length": 865.9866485595703, + "epoch": 0.5373758494511239, + "grad_norm": 1.319387435913086, + "kl": 0.61865234375, + "learning_rate": 5.697068764714439e-07, + "loss": 0.0555, + "reward": 1.0803571939468384, + "reward_std": 0.20210149884223938, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.9709821939468384, + "step": 1799 + }, + { + "completion_length": 974.4844055175781, + "epoch": 0.5376745575386453, + "grad_norm": 1.6136637926101685, + "kl": 0.64111328125, + "learning_rate": 5.692379546788683e-07, + "loss": 0.0359, + "reward": 1.10714291036129, + "reward_std": 0.2837919592857361, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9531250447034836, + "step": 1800 + }, + { + "completion_length": 920.4129943847656, + "epoch": 0.5379732656261669, + "grad_norm": 1.1147799491882324, + "kl": 0.84765625, + "learning_rate": 5.687690119572835e-07, + "loss": 0.0057, + "reward": 1.069196492433548, + "reward_std": 0.2672669589519501, + "rewards/accuracy_reward": 0.13616072316654027, + "rewards/format_reward": 0.933035746216774, + "step": 1801 + }, + { + "completion_length": 955.5379791259766, + "epoch": 0.5382719737136883, + "grad_norm": 1.2727131843566895, + "kl": 0.64990234375, + "learning_rate": 5.683000488168533e-07, + "loss": 0.0471, + "reward": 1.0959821939468384, + "reward_std": 0.2549469992518425, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9441964626312256, + "step": 1802 + }, + { + "completion_length": 970.763427734375, + "epoch": 0.5385706818012098, + "grad_norm": 1.612112283706665, + "kl": 0.53759765625, + "learning_rate": 5.678310657677634e-07, + "loss": 0.0248, + "reward": 1.0870535969734192, + "reward_std": 0.2616583779454231, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.941964328289032, + "step": 1803 + }, + { + "completion_length": 861.6763763427734, + "epoch": 0.5388693898887312, + "grad_norm": 1.7629995346069336, + "kl": 0.59521484375, + "learning_rate": 5.673620633202217e-07, + "loss": -0.0023, + "reward": 0.9866071939468384, + "reward_std": 0.2650345042347908, + "rewards/accuracy_reward": 0.055803574388846755, + "rewards/format_reward": 0.9308036118745804, + "step": 1804 + }, + { + "completion_length": 872.4129943847656, + "epoch": 0.5391680979762528, + "grad_norm": 1.4386945962905884, + "kl": 0.6533203125, + "learning_rate": 5.668930419844568e-07, + "loss": -0.0105, + "reward": 0.9754464775323868, + "reward_std": 0.20916741341352463, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.9464286118745804, + "step": 1805 + }, + { + "completion_length": 956.1250457763672, + "epoch": 0.5394668060637742, + "grad_norm": 2.2137789726257324, + "kl": 0.6630859375, + "learning_rate": 5.664240022707179e-07, + "loss": -0.0056, + "reward": 1.0491072088479996, + "reward_std": 0.27053192257881165, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.9375000298023224, + "step": 1806 + }, + { + "completion_length": 886.2723693847656, + "epoch": 0.5397655141512957, + "grad_norm": 1.9206621646881104, + "kl": 0.7548828125, + "learning_rate": 5.659549446892743e-07, + "loss": 0.0176, + "reward": 1.1026786267757416, + "reward_std": 0.29630066081881523, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9486607611179352, + "step": 1807 + }, + { + "completion_length": 828.6205596923828, + "epoch": 0.5400642222388171, + "grad_norm": 2.188901901245117, + "kl": 0.6640625, + "learning_rate": 5.654858697504145e-07, + "loss": 0.0279, + "reward": 1.0691964626312256, + "reward_std": 0.30829817801713943, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9107143133878708, + "step": 1808 + }, + { + "completion_length": 884.1920013427734, + "epoch": 0.5403629303263385, + "grad_norm": 6.836246490478516, + "kl": 0.6357421875, + "learning_rate": 5.650167779644464e-07, + "loss": 0.0088, + "reward": 1.051339328289032, + "reward_std": 0.23392919823527336, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.9397321790456772, + "step": 1809 + }, + { + "completion_length": 867.9308624267578, + "epoch": 0.5406616384138601, + "grad_norm": 1.8714486360549927, + "kl": 0.441162109375, + "learning_rate": 5.645476698416954e-07, + "loss": 0.0476, + "reward": 1.0714286267757416, + "reward_std": 0.23844574764370918, + "rewards/accuracy_reward": 0.10714285867288709, + "rewards/format_reward": 0.9642857611179352, + "step": 1810 + }, + { + "completion_length": 844.6027069091797, + "epoch": 0.5409603465013815, + "grad_norm": 1.3475745916366577, + "kl": 0.57373046875, + "learning_rate": 5.640785458925057e-07, + "loss": 0.0267, + "reward": 1.1116071939468384, + "reward_std": 0.18635160475969315, + "rewards/accuracy_reward": 0.14732143562287092, + "rewards/format_reward": 0.964285746216774, + "step": 1811 + }, + { + "completion_length": 832.9263610839844, + "epoch": 0.541259054588903, + "grad_norm": 0.5914788246154785, + "kl": 0.53955078125, + "learning_rate": 5.636094066272377e-07, + "loss": 0.0116, + "reward": 1.0781250596046448, + "reward_std": 0.24234626442193985, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.957589328289032, + "step": 1812 + }, + { + "completion_length": 859.9286193847656, + "epoch": 0.5415577626764244, + "grad_norm": 0.9950949549674988, + "kl": 0.49853515625, + "learning_rate": 5.631402525562693e-07, + "loss": 0.0433, + "reward": 1.10714291036129, + "reward_std": 0.2811945490539074, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9598214626312256, + "step": 1813 + }, + { + "completion_length": 854.872802734375, + "epoch": 0.5418564707639459, + "grad_norm": 1.5567495822906494, + "kl": 0.82080078125, + "learning_rate": 5.62671084189994e-07, + "loss": 0.0323, + "reward": 1.098214328289032, + "reward_std": 0.28765855729579926, + "rewards/accuracy_reward": 0.15625000977888703, + "rewards/format_reward": 0.9419643133878708, + "step": 1814 + }, + { + "completion_length": 861.8995971679688, + "epoch": 0.5421551788514674, + "grad_norm": 2.0452420711517334, + "kl": 0.767578125, + "learning_rate": 5.622019020388208e-07, + "loss": 0.0607, + "reward": 1.1227679252624512, + "reward_std": 0.2745203226804733, + "rewards/accuracy_reward": 0.16741071827709675, + "rewards/format_reward": 0.9553571790456772, + "step": 1815 + }, + { + "completion_length": 906.9888610839844, + "epoch": 0.5424538869389889, + "grad_norm": 0.9320907592773438, + "kl": 0.5185546875, + "learning_rate": 5.617327066131744e-07, + "loss": 0.0316, + "reward": 1.0357143133878708, + "reward_std": 0.263116292655468, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.95089291036129, + "step": 1816 + }, + { + "completion_length": 905.4732360839844, + "epoch": 0.5427525950265103, + "grad_norm": 1.1866605281829834, + "kl": 0.60400390625, + "learning_rate": 5.612634984234929e-07, + "loss": 0.0538, + "reward": 1.0892857611179352, + "reward_std": 0.19970239885151386, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.964285746216774, + "step": 1817 + }, + { + "completion_length": 872.3281707763672, + "epoch": 0.5430513031140318, + "grad_norm": 2.2682480812072754, + "kl": 0.86328125, + "learning_rate": 5.607942779802292e-07, + "loss": 0.0407, + "reward": 1.0937500596046448, + "reward_std": 0.23986324667930603, + "rewards/accuracy_reward": 0.15848215273581445, + "rewards/format_reward": 0.93526791036129, + "step": 1818 + }, + { + "completion_length": 899.3236846923828, + "epoch": 0.5433500112015532, + "grad_norm": 2.1203255653381348, + "kl": 0.8857421875, + "learning_rate": 5.603250457938489e-07, + "loss": 0.012, + "reward": 1.102678656578064, + "reward_std": 0.267047006636858, + "rewards/accuracy_reward": 0.15401786379516125, + "rewards/format_reward": 0.9486607611179352, + "step": 1819 + }, + { + "completion_length": 930.3862152099609, + "epoch": 0.5436487192890748, + "grad_norm": 1.0832678079605103, + "kl": 0.6953125, + "learning_rate": 5.598558023748307e-07, + "loss": 0.0449, + "reward": 1.0647321939468384, + "reward_std": 0.20788700878620148, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9620536118745804, + "step": 1820 + }, + { + "completion_length": 874.1272583007812, + "epoch": 0.5439474273765962, + "grad_norm": 1.413291335105896, + "kl": 0.83154296875, + "learning_rate": 5.593865482336657e-07, + "loss": 0.0097, + "reward": 1.1562500596046448, + "reward_std": 0.250826433300972, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.948660746216774, + "step": 1821 + }, + { + "completion_length": 949.1339874267578, + "epoch": 0.5442461354641177, + "grad_norm": 1.0768529176712036, + "kl": 0.6337890625, + "learning_rate": 5.589172838808561e-07, + "loss": 0.0425, + "reward": 1.1026786416769028, + "reward_std": 0.24957532435655594, + "rewards/accuracy_reward": 0.15625001350417733, + "rewards/format_reward": 0.9464286118745804, + "step": 1822 + }, + { + "completion_length": 846.6361999511719, + "epoch": 0.5445448435516391, + "grad_norm": 1.0004574060440063, + "kl": 0.6865234375, + "learning_rate": 5.584480098269155e-07, + "loss": 0.0293, + "reward": 1.0959821939468384, + "reward_std": 0.2855517081916332, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.957589328289032, + "step": 1823 + }, + { + "completion_length": 850.5848541259766, + "epoch": 0.5448435516391607, + "grad_norm": 1.7255277633666992, + "kl": 0.69384765625, + "learning_rate": 5.579787265823684e-07, + "loss": 0.0336, + "reward": 1.0736607611179352, + "reward_std": 0.24386529624462128, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9419643431901932, + "step": 1824 + }, + { + "completion_length": 829.9598541259766, + "epoch": 0.5451422597266821, + "grad_norm": 2.2369627952575684, + "kl": 0.650390625, + "learning_rate": 5.575094346577487e-07, + "loss": 0.0271, + "reward": 1.1964286267757416, + "reward_std": 0.3011052832007408, + "rewards/accuracy_reward": 0.2544643022119999, + "rewards/format_reward": 0.941964328289032, + "step": 1825 + }, + { + "completion_length": 844.2388763427734, + "epoch": 0.5454409678142036, + "grad_norm": 2.076406955718994, + "kl": 0.8427734375, + "learning_rate": 5.570401345636e-07, + "loss": 0.0333, + "reward": 1.0000000298023224, + "reward_std": 0.2558807320892811, + "rewards/accuracy_reward": 0.06250000279396772, + "rewards/format_reward": 0.9375000447034836, + "step": 1826 + }, + { + "completion_length": 831.2991333007812, + "epoch": 0.545739675901725, + "grad_norm": 1.0485260486602783, + "kl": 0.728515625, + "learning_rate": 5.56570826810475e-07, + "loss": 0.0492, + "reward": 1.162946492433548, + "reward_std": 0.19730420410633087, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.979910746216774, + "step": 1827 + }, + { + "completion_length": 851.5491333007812, + "epoch": 0.5460383839892465, + "grad_norm": 2.840136766433716, + "kl": 0.791015625, + "learning_rate": 5.561015119089345e-07, + "loss": 0.0167, + "reward": 1.1897321939468384, + "reward_std": 0.21943502500653267, + "rewards/accuracy_reward": 0.232142873108387, + "rewards/format_reward": 0.957589328289032, + "step": 1828 + }, + { + "completion_length": 878.5893096923828, + "epoch": 0.546337092076768, + "grad_norm": 1.2595340013504028, + "kl": 0.697265625, + "learning_rate": 5.556321903695469e-07, + "loss": 0.0314, + "reward": 1.0357143431901932, + "reward_std": 0.1877634972333908, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.9620536118745804, + "step": 1829 + }, + { + "completion_length": 830.8750305175781, + "epoch": 0.5466358001642895, + "grad_norm": 1.0809215307235718, + "kl": 0.5673828125, + "learning_rate": 5.551628627028883e-07, + "loss": 0.0322, + "reward": 1.1763393580913544, + "reward_std": 0.20761772617697716, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.973214328289032, + "step": 1830 + }, + { + "completion_length": 864.4018402099609, + "epoch": 0.5469345082518109, + "grad_norm": 2.1842174530029297, + "kl": 0.623046875, + "learning_rate": 5.546935294195411e-07, + "loss": 0.0149, + "reward": 1.2008928954601288, + "reward_std": 0.28566971607506275, + "rewards/accuracy_reward": 0.2321428619325161, + "rewards/format_reward": 0.9687500298023224, + "step": 1831 + }, + { + "completion_length": 931.1339721679688, + "epoch": 0.5472332163393324, + "grad_norm": 2.1365842819213867, + "kl": 0.7890625, + "learning_rate": 5.542241910300939e-07, + "loss": 0.0228, + "reward": 1.1517857611179352, + "reward_std": 0.23777073435485363, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.964285746216774, + "step": 1832 + }, + { + "completion_length": 791.3527069091797, + "epoch": 0.5475319244268538, + "grad_norm": 1.0039689540863037, + "kl": 0.57666015625, + "learning_rate": 5.537548480451408e-07, + "loss": 0.0397, + "reward": 1.0915178954601288, + "reward_std": 0.23188984766602516, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 1833 + }, + { + "completion_length": 887.216552734375, + "epoch": 0.5478306325143754, + "grad_norm": 1.0610575675964355, + "kl": 0.66796875, + "learning_rate": 5.53285500975281e-07, + "loss": 0.0149, + "reward": 1.0937500596046448, + "reward_std": 0.258739085868001, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 1834 + }, + { + "completion_length": 818.7433471679688, + "epoch": 0.5481293406018968, + "grad_norm": 1.8238667249679565, + "kl": 0.3876953125, + "learning_rate": 5.528161503311184e-07, + "loss": 0.0705, + "reward": 1.058035746216774, + "reward_std": 0.24718783050775528, + "rewards/accuracy_reward": 0.09821429150179029, + "rewards/format_reward": 0.9598214775323868, + "step": 1835 + }, + { + "completion_length": 848.0379943847656, + "epoch": 0.5484280486894183, + "grad_norm": 1.538609504699707, + "kl": 0.6240234375, + "learning_rate": 5.5234679662326e-07, + "loss": 0.067, + "reward": 1.13839291036129, + "reward_std": 0.24308280274271965, + "rewards/accuracy_reward": 0.1808035857975483, + "rewards/format_reward": 0.957589328289032, + "step": 1836 + }, + { + "completion_length": 866.4308471679688, + "epoch": 0.5487267567769397, + "grad_norm": 1.9540365934371948, + "kl": 0.4677734375, + "learning_rate": 5.518774403623169e-07, + "loss": 0.0513, + "reward": 1.1004464626312256, + "reward_std": 0.24939172714948654, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.9508928954601288, + "step": 1837 + }, + { + "completion_length": 941.6451263427734, + "epoch": 0.5490254648644612, + "grad_norm": 0.7412601113319397, + "kl": 0.345703125, + "learning_rate": 5.514080820589029e-07, + "loss": 0.005, + "reward": 1.0513393431901932, + "reward_std": 0.23193785920739174, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.9620535969734192, + "step": 1838 + }, + { + "completion_length": 911.0201263427734, + "epoch": 0.5493241729519827, + "grad_norm": 1.182922124862671, + "kl": 0.62548828125, + "learning_rate": 5.509387222236336e-07, + "loss": -0.0125, + "reward": 1.1116071939468384, + "reward_std": 0.29072992876172066, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9464285969734192, + "step": 1839 + }, + { + "completion_length": 846.8594207763672, + "epoch": 0.5496228810395042, + "grad_norm": 1.3188331127166748, + "kl": 0.5576171875, + "learning_rate": 5.504693613671263e-07, + "loss": 0.053, + "reward": 1.1071428954601288, + "reward_std": 0.227452851831913, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9553571790456772, + "step": 1840 + }, + { + "completion_length": 928.0223541259766, + "epoch": 0.5499215891270256, + "grad_norm": 0.8549246788024902, + "kl": 0.587890625, + "learning_rate": 5.5e-07, + "loss": 0.0268, + "reward": 1.0937500447034836, + "reward_std": 0.2003508321940899, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.9642857611179352, + "step": 1841 + }, + { + "completion_length": 870.1518249511719, + "epoch": 0.5502202972145471, + "grad_norm": 0.9399704337120056, + "kl": 0.75830078125, + "learning_rate": 5.495306386328738e-07, + "loss": 0.011, + "reward": 1.0736607760190964, + "reward_std": 0.2391969859600067, + "rewards/accuracy_reward": 0.13392858067527413, + "rewards/format_reward": 0.9397321939468384, + "step": 1842 + }, + { + "completion_length": 806.6786193847656, + "epoch": 0.5505190053020685, + "grad_norm": 1.2997013330459595, + "kl": 0.46142578125, + "learning_rate": 5.490612777763665e-07, + "loss": -0.0254, + "reward": 1.073660746216774, + "reward_std": 0.1898839734494686, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9464286118745804, + "step": 1843 + }, + { + "completion_length": 854.263427734375, + "epoch": 0.5508177133895901, + "grad_norm": 1.0072205066680908, + "kl": 0.74267578125, + "learning_rate": 5.48591917941097e-07, + "loss": 0.0242, + "reward": 1.084821492433548, + "reward_std": 0.22247397527098656, + "rewards/accuracy_reward": 0.12500000931322575, + "rewards/format_reward": 0.9598214626312256, + "step": 1844 + }, + { + "completion_length": 899.4799499511719, + "epoch": 0.5511164214771115, + "grad_norm": 1.1961355209350586, + "kl": 0.60693359375, + "learning_rate": 5.48122559637683e-07, + "loss": 0.005, + "reward": 1.1071428954601288, + "reward_std": 0.17169320583343506, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.973214328289032, + "step": 1845 + }, + { + "completion_length": 875.638427734375, + "epoch": 0.551415129564633, + "grad_norm": 1.1259926557540894, + "kl": 0.53564453125, + "learning_rate": 5.476532033767401e-07, + "loss": 0.0228, + "reward": 1.0825893580913544, + "reward_std": 0.22876619175076485, + "rewards/accuracy_reward": 0.12500000279396772, + "rewards/format_reward": 0.957589328289032, + "step": 1846 + }, + { + "completion_length": 928.9420166015625, + "epoch": 0.5517138376521544, + "grad_norm": 2.0824191570281982, + "kl": 0.87548828125, + "learning_rate": 5.471838496688817e-07, + "loss": -0.0179, + "reward": 1.066964328289032, + "reward_std": 0.17927038297057152, + "rewards/accuracy_reward": 0.11830358067527413, + "rewards/format_reward": 0.948660746216774, + "step": 1847 + }, + { + "completion_length": 788.8103179931641, + "epoch": 0.552012545739676, + "grad_norm": 1.3398946523666382, + "kl": 0.67724609375, + "learning_rate": 5.467144990247189e-07, + "loss": 0.0422, + "reward": 1.0178571790456772, + "reward_std": 0.24105024710297585, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.9486607611179352, + "step": 1848 + }, + { + "completion_length": 861.7232360839844, + "epoch": 0.5523112538271974, + "grad_norm": 0.9732630252838135, + "kl": 0.5693359375, + "learning_rate": 5.462451519548592e-07, + "loss": 0.0179, + "reward": 1.0892857909202576, + "reward_std": 0.2181970402598381, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9598214775323868, + "step": 1849 + }, + { + "completion_length": 907.9955749511719, + "epoch": 0.5526099619147189, + "grad_norm": 1.0123374462127686, + "kl": 0.6318359375, + "learning_rate": 5.45775808969906e-07, + "loss": 0.0226, + "reward": 1.1205357611179352, + "reward_std": 0.2763739414513111, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9397321939468384, + "step": 1850 + }, + { + "completion_length": 922.0402221679688, + "epoch": 0.5529086700022403, + "grad_norm": 1.256097674369812, + "kl": 0.60595703125, + "learning_rate": 5.453064705804588e-07, + "loss": 0.0292, + "reward": 1.1473214626312256, + "reward_std": 0.24616897478699684, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9575893133878708, + "step": 1851 + }, + { + "completion_length": 911.5536041259766, + "epoch": 0.5532073780897617, + "grad_norm": 1.0946660041809082, + "kl": 0.60205078125, + "learning_rate": 5.448371372971116e-07, + "loss": -0.0161, + "reward": 1.1674107313156128, + "reward_std": 0.27828139811754227, + "rewards/accuracy_reward": 0.20758928917348385, + "rewards/format_reward": 0.9598214775323868, + "step": 1852 + }, + { + "completion_length": 950.435302734375, + "epoch": 0.5535060861772833, + "grad_norm": 2.6242427825927734, + "kl": 0.7783203125, + "learning_rate": 5.44367809630453e-07, + "loss": 0.0308, + "reward": 1.1049107611179352, + "reward_std": 0.22595320269465446, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.9531250447034836, + "step": 1853 + }, + { + "completion_length": 897.8437805175781, + "epoch": 0.5538047942648047, + "grad_norm": 1.247045636177063, + "kl": 0.5458984375, + "learning_rate": 5.438984880910656e-07, + "loss": 0.0369, + "reward": 1.2008929252624512, + "reward_std": 0.2679678499698639, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9531250447034836, + "step": 1854 + }, + { + "completion_length": 927.9598693847656, + "epoch": 0.5541035023523262, + "grad_norm": 1.0306928157806396, + "kl": 0.456787109375, + "learning_rate": 5.43429173189525e-07, + "loss": 0.0503, + "reward": 1.1316964626312256, + "reward_std": 0.2680254504084587, + "rewards/accuracy_reward": 0.17857143469154835, + "rewards/format_reward": 0.9531250298023224, + "step": 1855 + }, + { + "completion_length": 926.2433471679688, + "epoch": 0.5544022104398476, + "grad_norm": 1.321089744567871, + "kl": 0.58935546875, + "learning_rate": 5.429598654364e-07, + "loss": 0.0617, + "reward": 1.0066964775323868, + "reward_std": 0.17695712484419346, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.9598214775323868, + "step": 1856 + }, + { + "completion_length": 939.0290679931641, + "epoch": 0.5547009185273691, + "grad_norm": 1.1876355409622192, + "kl": 0.6572265625, + "learning_rate": 5.424905653422514e-07, + "loss": 0.0358, + "reward": 1.1116072088479996, + "reward_std": 0.2772331014275551, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9508928954601288, + "step": 1857 + }, + { + "completion_length": 921.8460235595703, + "epoch": 0.5549996266148906, + "grad_norm": 1.0950889587402344, + "kl": 0.58935546875, + "learning_rate": 5.420212734176315e-07, + "loss": -0.0115, + "reward": 1.1540178954601288, + "reward_std": 0.24741074815392494, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9531250447034836, + "step": 1858 + }, + { + "completion_length": 951.7745971679688, + "epoch": 0.5552983347024121, + "grad_norm": 1.3218923807144165, + "kl": 0.7216796875, + "learning_rate": 5.415519901730845e-07, + "loss": 0.0425, + "reward": 1.1026785969734192, + "reward_std": 0.2710876688361168, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.9553571790456772, + "step": 1859 + }, + { + "completion_length": 938.2567443847656, + "epoch": 0.5555970427899335, + "grad_norm": 0.819341242313385, + "kl": 0.5947265625, + "learning_rate": 5.410827161191441e-07, + "loss": 0.016, + "reward": 1.0669643580913544, + "reward_std": 0.21903004869818687, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9598214775323868, + "step": 1860 + }, + { + "completion_length": 874.5692291259766, + "epoch": 0.555895750877455, + "grad_norm": 2.0838611125946045, + "kl": 0.6748046875, + "learning_rate": 5.406134517663344e-07, + "loss": 0.0277, + "reward": 1.1718750596046448, + "reward_std": 0.30620741471648216, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 1861 + }, + { + "completion_length": 882.4576110839844, + "epoch": 0.5561944589649764, + "grad_norm": 1.2572966814041138, + "kl": 0.85546875, + "learning_rate": 5.401441976251691e-07, + "loss": 0.0477, + "reward": 1.10714291036129, + "reward_std": 0.2652099244296551, + "rewards/accuracy_reward": 0.16517857694998384, + "rewards/format_reward": 0.941964328289032, + "step": 1862 + }, + { + "completion_length": 847.8861999511719, + "epoch": 0.556493167052498, + "grad_norm": 1.6828213930130005, + "kl": 0.466064453125, + "learning_rate": 5.396749542061512e-07, + "loss": 0.0136, + "reward": 1.0580357909202576, + "reward_std": 0.22779106721282005, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.9709821790456772, + "step": 1863 + }, + { + "completion_length": 1027.2768249511719, + "epoch": 0.5567918751400194, + "grad_norm": 1.2140836715698242, + "kl": 0.8203125, + "learning_rate": 5.392057220197708e-07, + "loss": 0.0456, + "reward": 1.0870536118745804, + "reward_std": 0.257246196269989, + "rewards/accuracy_reward": 0.13839286612346768, + "rewards/format_reward": 0.9486607611179352, + "step": 1864 + }, + { + "completion_length": 953.6205749511719, + "epoch": 0.5570905832275409, + "grad_norm": 2.5467004776000977, + "kl": 0.9794921875, + "learning_rate": 5.387365015765071e-07, + "loss": -0.005, + "reward": 1.082589328289032, + "reward_std": 0.2134163249284029, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.9486607611179352, + "step": 1865 + }, + { + "completion_length": 860.0692138671875, + "epoch": 0.5573892913150623, + "grad_norm": 2.032801628112793, + "kl": 0.7919921875, + "learning_rate": 5.382672933868258e-07, + "loss": 0.0431, + "reward": 1.1183036267757416, + "reward_std": 0.24565836414694786, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.9620535969734192, + "step": 1866 + }, + { + "completion_length": 903.872802734375, + "epoch": 0.5576879994025838, + "grad_norm": 1.3699346780776978, + "kl": 0.8232421875, + "learning_rate": 5.377980979611792e-07, + "loss": 0.0447, + "reward": 1.2120536267757416, + "reward_std": 0.25338228419423103, + "rewards/accuracy_reward": 0.25669644493609667, + "rewards/format_reward": 0.9553571939468384, + "step": 1867 + }, + { + "completion_length": 864.3973846435547, + "epoch": 0.5579867074901053, + "grad_norm": 2.1650938987731934, + "kl": 0.75390625, + "learning_rate": 5.373289158100061e-07, + "loss": 0.0106, + "reward": 1.0535714775323868, + "reward_std": 0.21563256904482841, + "rewards/accuracy_reward": 0.09375000302679837, + "rewards/format_reward": 0.9598214626312256, + "step": 1868 + }, + { + "completion_length": 913.6027069091797, + "epoch": 0.5582854155776268, + "grad_norm": 1.1921664476394653, + "kl": 0.76806640625, + "learning_rate": 5.368597474437308e-07, + "loss": 0.0167, + "reward": 1.0959821939468384, + "reward_std": 0.14926199987530708, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9754464477300644, + "step": 1869 + }, + { + "completion_length": 963.2500305175781, + "epoch": 0.5585841236651482, + "grad_norm": 1.761594533920288, + "kl": 0.9052734375, + "learning_rate": 5.363905933727624e-07, + "loss": 0.0171, + "reward": 1.082589328289032, + "reward_std": 0.29214484617114067, + "rewards/accuracy_reward": 0.14285714854486287, + "rewards/format_reward": 0.9397321790456772, + "step": 1870 + }, + { + "completion_length": 927.8036193847656, + "epoch": 0.5588828317526697, + "grad_norm": 1.2523529529571533, + "kl": 0.953125, + "learning_rate": 5.359214541074943e-07, + "loss": -0.0165, + "reward": 1.0513393431901932, + "reward_std": 0.29256028681993484, + "rewards/accuracy_reward": 0.11383928847499192, + "rewards/format_reward": 0.9375000298023224, + "step": 1871 + }, + { + "completion_length": 970.3259429931641, + "epoch": 0.5591815398401911, + "grad_norm": 1.4169039726257324, + "kl": 0.9296875, + "learning_rate": 5.354523301583046e-07, + "loss": 0.0172, + "reward": 1.051339328289032, + "reward_std": 0.19707737118005753, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.9620535969734192, + "step": 1872 + }, + { + "completion_length": 848.6808319091797, + "epoch": 0.5594802479277127, + "grad_norm": 1.131082534790039, + "kl": 0.869140625, + "learning_rate": 5.349832220355537e-07, + "loss": 0.0225, + "reward": 1.0803571492433548, + "reward_std": 0.24248141422867775, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9375000298023224, + "step": 1873 + }, + { + "completion_length": 879.8393249511719, + "epoch": 0.5597789560152341, + "grad_norm": 0.9574275612831116, + "kl": 0.705078125, + "learning_rate": 5.345141302495855e-07, + "loss": 0.0131, + "reward": 1.1093750596046448, + "reward_std": 0.22995072603225708, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9486607611179352, + "step": 1874 + }, + { + "completion_length": 925.4509429931641, + "epoch": 0.5600776641027556, + "grad_norm": 1.0686986446380615, + "kl": 0.7666015625, + "learning_rate": 5.340450553107257e-07, + "loss": 0.0536, + "reward": 1.1183035969734192, + "reward_std": 0.2986510433256626, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9508928805589676, + "step": 1875 + }, + { + "completion_length": 989.8437805175781, + "epoch": 0.560376372190277, + "grad_norm": 1.2044295072555542, + "kl": 0.765625, + "learning_rate": 5.335759977292821e-07, + "loss": 0.0091, + "reward": 1.084821492433548, + "reward_std": 0.26318193413317204, + "rewards/accuracy_reward": 0.14955357555299997, + "rewards/format_reward": 0.93526791036129, + "step": 1876 + }, + { + "completion_length": 986.1585388183594, + "epoch": 0.5606750802777986, + "grad_norm": 1.3394064903259277, + "kl": 0.9267578125, + "learning_rate": 5.331069580155431e-07, + "loss": 0.0231, + "reward": 1.145089328289032, + "reward_std": 0.2233234867453575, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.9553571790456772, + "step": 1877 + }, + { + "completion_length": 978.7634582519531, + "epoch": 0.56097378836532, + "grad_norm": 1.2240197658538818, + "kl": 0.6708984375, + "learning_rate": 5.326379366797782e-07, + "loss": 0.0326, + "reward": 1.0803571790456772, + "reward_std": 0.244199488312006, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9553571939468384, + "step": 1878 + }, + { + "completion_length": 983.9687805175781, + "epoch": 0.5612724964528415, + "grad_norm": 0.8278268575668335, + "kl": 0.52197265625, + "learning_rate": 5.321689342322366e-07, + "loss": 0.0016, + "reward": 1.1227678805589676, + "reward_std": 0.24735751748085022, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9620536267757416, + "step": 1879 + }, + { + "completion_length": 953.8437957763672, + "epoch": 0.5615712045403629, + "grad_norm": 147.96011352539062, + "kl": 5.92333984375, + "learning_rate": 5.316999511831468e-07, + "loss": 0.227, + "reward": 1.116071492433548, + "reward_std": 0.275379441678524, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9553571790456772, + "step": 1880 + }, + { + "completion_length": 917.9241485595703, + "epoch": 0.5618699126278844, + "grad_norm": 0.763617217540741, + "kl": 0.476318359375, + "learning_rate": 5.312309880427166e-07, + "loss": 0.0094, + "reward": 1.145089328289032, + "reward_std": 0.20532264187932014, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9799107611179352, + "step": 1881 + }, + { + "completion_length": 1059.1072082519531, + "epoch": 0.5621686207154059, + "grad_norm": 2.63852858543396, + "kl": 0.508056640625, + "learning_rate": 5.307620453211317e-07, + "loss": 0.0629, + "reward": 1.147321492433548, + "reward_std": 0.2890402004122734, + "rewards/accuracy_reward": 0.20312500838190317, + "rewards/format_reward": 0.9441964775323868, + "step": 1882 + }, + { + "completion_length": 1012.2232818603516, + "epoch": 0.5624673288029274, + "grad_norm": 0.6220883727073669, + "kl": 0.4248046875, + "learning_rate": 5.30293123528556e-07, + "loss": 0.0121, + "reward": 1.2321429252624512, + "reward_std": 0.24968013539910316, + "rewards/accuracy_reward": 0.27008929662406445, + "rewards/format_reward": 0.9620535969734192, + "step": 1883 + }, + { + "completion_length": 1037.3973693847656, + "epoch": 0.5627660368904488, + "grad_norm": 1.4711312055587769, + "kl": 0.4892578125, + "learning_rate": 5.298242231751305e-07, + "loss": 0.0307, + "reward": 1.1562500596046448, + "reward_std": 0.22969994693994522, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.96651791036129, + "step": 1884 + }, + { + "completion_length": 1018.1362152099609, + "epoch": 0.5630647449779703, + "grad_norm": 0.8354600071907043, + "kl": 0.3984375, + "learning_rate": 5.293553447709729e-07, + "loss": 0.0202, + "reward": 1.0892857611179352, + "reward_std": 0.28779512271285057, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9531250596046448, + "step": 1885 + }, + { + "completion_length": 928.4085235595703, + "epoch": 0.5633634530654917, + "grad_norm": 1.6675881147384644, + "kl": 0.57080078125, + "learning_rate": 5.288864888261765e-07, + "loss": 0.0502, + "reward": 1.1540178954601288, + "reward_std": 0.248235784471035, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9441964626312256, + "step": 1886 + }, + { + "completion_length": 983.9263763427734, + "epoch": 0.5636621611530133, + "grad_norm": 1.5863293409347534, + "kl": 0.5498046875, + "learning_rate": 5.28417655850811e-07, + "loss": 0.006, + "reward": 1.0937500298023224, + "reward_std": 0.24910276010632515, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.941964328289032, + "step": 1887 + }, + { + "completion_length": 949.0089721679688, + "epoch": 0.5639608692405347, + "grad_norm": 0.9393160939216614, + "kl": 0.43505859375, + "learning_rate": 5.279488463549208e-07, + "loss": 0.0328, + "reward": 1.0781250894069672, + "reward_std": 0.27332741767168045, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.957589328289032, + "step": 1888 + }, + { + "completion_length": 937.8616485595703, + "epoch": 0.5642595773280562, + "grad_norm": 0.886673092842102, + "kl": 0.40185546875, + "learning_rate": 5.274800608485243e-07, + "loss": 0.0141, + "reward": 1.0982143580913544, + "reward_std": 0.21927303820848465, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.957589328289032, + "step": 1889 + }, + { + "completion_length": 949.841552734375, + "epoch": 0.5645582854155776, + "grad_norm": 1.1789082288742065, + "kl": 0.4384765625, + "learning_rate": 5.270112998416145e-07, + "loss": 0.0384, + "reward": 1.1808036416769028, + "reward_std": 0.23338325694203377, + "rewards/accuracy_reward": 0.22767858440056443, + "rewards/format_reward": 0.9531250447034836, + "step": 1890 + }, + { + "completion_length": 974.9888763427734, + "epoch": 0.5648569935030991, + "grad_norm": 1.4656221866607666, + "kl": 0.52490234375, + "learning_rate": 5.265425638441574e-07, + "loss": 0.0236, + "reward": 1.0781250596046448, + "reward_std": 0.26934410259127617, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9531250447034836, + "step": 1891 + }, + { + "completion_length": 966.9129791259766, + "epoch": 0.5651557015906206, + "grad_norm": 1.1365301609039307, + "kl": 0.383056640625, + "learning_rate": 5.260738533660917e-07, + "loss": 0.0295, + "reward": 1.113839328289032, + "reward_std": 0.21858960762619972, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9441964775323868, + "step": 1892 + }, + { + "completion_length": 862.6161041259766, + "epoch": 0.5654544096781421, + "grad_norm": 1.3989953994750977, + "kl": 0.45947265625, + "learning_rate": 5.256051689173284e-07, + "loss": 0.0122, + "reward": 1.1495536267757416, + "reward_std": 0.23902100697159767, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9575893133878708, + "step": 1893 + }, + { + "completion_length": 955.2009582519531, + "epoch": 0.5657531177656635, + "grad_norm": 1.208383321762085, + "kl": 0.59033203125, + "learning_rate": 5.251365110077506e-07, + "loss": 0.0369, + "reward": 1.194196492433548, + "reward_std": 0.277606263756752, + "rewards/accuracy_reward": 0.2343750074505806, + "rewards/format_reward": 0.9598214477300644, + "step": 1894 + }, + { + "completion_length": 878.091552734375, + "epoch": 0.5660518258531849, + "grad_norm": 0.9722246527671814, + "kl": 0.31298828125, + "learning_rate": 5.246678801472118e-07, + "loss": 0.0001, + "reward": 1.116071492433548, + "reward_std": 0.29690831527113914, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9531250596046448, + "step": 1895 + }, + { + "completion_length": 989.6406707763672, + "epoch": 0.5663505339407064, + "grad_norm": 1.180966854095459, + "kl": 0.5732421875, + "learning_rate": 5.241992768455366e-07, + "loss": 0.0404, + "reward": 1.1227678954601288, + "reward_std": 0.21003706753253937, + "rewards/accuracy_reward": 0.1785714398138225, + "rewards/format_reward": 0.9441964775323868, + "step": 1896 + }, + { + "completion_length": 895.1897735595703, + "epoch": 0.5666492420282279, + "grad_norm": 0.8785085678100586, + "kl": 0.50439453125, + "learning_rate": 5.237307016125194e-07, + "loss": 0.0153, + "reward": 1.0178571939468384, + "reward_std": 0.16107439063489437, + "rewards/accuracy_reward": 0.049107146449387074, + "rewards/format_reward": 0.9687500447034836, + "step": 1897 + }, + { + "completion_length": 893.7388763427734, + "epoch": 0.5669479501157494, + "grad_norm": 0.7558185458183289, + "kl": 0.48486328125, + "learning_rate": 5.232621549579242e-07, + "loss": 0.0276, + "reward": 1.1183036267757416, + "reward_std": 0.2561146095395088, + "rewards/accuracy_reward": 0.16964286006987095, + "rewards/format_reward": 0.9486607760190964, + "step": 1898 + }, + { + "completion_length": 878.8817291259766, + "epoch": 0.5672466582032708, + "grad_norm": 1.0723341703414917, + "kl": 0.49853515625, + "learning_rate": 5.22793637391484e-07, + "loss": 0.0281, + "reward": 1.0468750298023224, + "reward_std": 0.2755284942686558, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9486607611179352, + "step": 1899 + }, + { + "completion_length": 856.2902069091797, + "epoch": 0.5675453662907923, + "grad_norm": 1.5211812257766724, + "kl": 0.5341796875, + "learning_rate": 5.223251494228995e-07, + "loss": 0.0519, + "reward": 1.1361607909202576, + "reward_std": 0.2653684467077255, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.9464286267757416, + "step": 1900 + }, + { + "completion_length": 945.5357666015625, + "epoch": 0.5678440743783137, + "grad_norm": 1.2516984939575195, + "kl": 0.45849609375, + "learning_rate": 5.218566915618402e-07, + "loss": 0.0116, + "reward": 1.0602678954601288, + "reward_std": 0.2519943341612816, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9330357760190964, + "step": 1901 + }, + { + "completion_length": 824.8192291259766, + "epoch": 0.5681427824658353, + "grad_norm": 1.4531617164611816, + "kl": 0.505615234375, + "learning_rate": 5.213882643179422e-07, + "loss": 0.0224, + "reward": 1.1071428954601288, + "reward_std": 0.31537188589572906, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9441964626312256, + "step": 1902 + }, + { + "completion_length": 887.529052734375, + "epoch": 0.5684414905533567, + "grad_norm": 1.0884779691696167, + "kl": 0.43115234375, + "learning_rate": 5.209198682008081e-07, + "loss": 0.0192, + "reward": 1.04464291036129, + "reward_std": 0.300282821059227, + "rewards/accuracy_reward": 0.11160714831203222, + "rewards/format_reward": 0.933035746216774, + "step": 1903 + }, + { + "completion_length": 899.5268402099609, + "epoch": 0.5687401986408782, + "grad_norm": 1.2067142724990845, + "kl": 0.57177734375, + "learning_rate": 5.204515037200074e-07, + "loss": 0.0359, + "reward": 1.1696428954601288, + "reward_std": 0.18028504215180874, + "rewards/accuracy_reward": 0.2031250037252903, + "rewards/format_reward": 0.96651791036129, + "step": 1904 + }, + { + "completion_length": 827.3705749511719, + "epoch": 0.5690389067283996, + "grad_norm": 0.9484045505523682, + "kl": 0.50537109375, + "learning_rate": 5.199831713850748e-07, + "loss": 0.0455, + "reward": 1.1272321939468384, + "reward_std": 0.29428427666425705, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9464286118745804, + "step": 1905 + }, + { + "completion_length": 829.122802734375, + "epoch": 0.5693376148159212, + "grad_norm": 0.7334259152412415, + "kl": 0.34765625, + "learning_rate": 5.195148717055094e-07, + "loss": 0.0283, + "reward": 1.10714291036129, + "reward_std": 0.17418322525918484, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.9687500447034836, + "step": 1906 + }, + { + "completion_length": 859.0491485595703, + "epoch": 0.5696363229034426, + "grad_norm": 1.5277962684631348, + "kl": 0.51416015625, + "learning_rate": 5.19046605190776e-07, + "loss": 0.0124, + "reward": 1.0625000596046448, + "reward_std": 0.15935256704688072, + "rewards/accuracy_reward": 0.08705357601866126, + "rewards/format_reward": 0.9754464775323868, + "step": 1907 + }, + { + "completion_length": 892.6004791259766, + "epoch": 0.5699350309909641, + "grad_norm": 1.3335464000701904, + "kl": 0.4580078125, + "learning_rate": 5.185783723503022e-07, + "loss": 0.0088, + "reward": 1.1741071939468384, + "reward_std": 0.2861950695514679, + "rewards/accuracy_reward": 0.2209821566939354, + "rewards/format_reward": 0.9531250298023224, + "step": 1908 + }, + { + "completion_length": 994.8572082519531, + "epoch": 0.5702337390784855, + "grad_norm": 0.6951137781143188, + "kl": 0.4453125, + "learning_rate": 5.181101736934798e-07, + "loss": 0.0192, + "reward": 1.0558036267757416, + "reward_std": 0.2849661335349083, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.9575893133878708, + "step": 1909 + }, + { + "completion_length": 840.2857360839844, + "epoch": 0.570532447166007, + "grad_norm": 1.5715574026107788, + "kl": 0.69970703125, + "learning_rate": 5.176420097296627e-07, + "loss": 0.0505, + "reward": 1.1651786267757416, + "reward_std": 0.254564069211483, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.9665178954601288, + "step": 1910 + }, + { + "completion_length": 923.497802734375, + "epoch": 0.5708311552535285, + "grad_norm": 1.7845114469528198, + "kl": 0.317626953125, + "learning_rate": 5.171738809681677e-07, + "loss": -0.0017, + "reward": 1.160714328289032, + "reward_std": 0.21663515642285347, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 1911 + }, + { + "completion_length": 920.3370819091797, + "epoch": 0.57112986334105, + "grad_norm": 1.2312778234481812, + "kl": 0.36474609375, + "learning_rate": 5.167057879182729e-07, + "loss": 0.0041, + "reward": 1.1785714328289032, + "reward_std": 0.2608230262994766, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.96651791036129, + "step": 1912 + }, + { + "completion_length": 867.4844207763672, + "epoch": 0.5714285714285714, + "grad_norm": 0.9288734793663025, + "kl": 0.412109375, + "learning_rate": 5.162377310892177e-07, + "loss": 0.0474, + "reward": 1.1607143580913544, + "reward_std": 0.28049543127417564, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9687500298023224, + "step": 1913 + }, + { + "completion_length": 860.3995819091797, + "epoch": 0.5717272795160929, + "grad_norm": 1.485948920249939, + "kl": 0.334228515625, + "learning_rate": 5.15769710990202e-07, + "loss": 0.0261, + "reward": 1.2276786267757416, + "reward_std": 0.3067485950887203, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.9709821790456772, + "step": 1914 + }, + { + "completion_length": 916.7678985595703, + "epoch": 0.5720259876036143, + "grad_norm": 2.0147905349731445, + "kl": 0.47509765625, + "learning_rate": 5.153017281303858e-07, + "loss": 0.0482, + "reward": 1.1450892984867096, + "reward_std": 0.2283257581293583, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.957589328289032, + "step": 1915 + }, + { + "completion_length": 861.7745971679688, + "epoch": 0.5723246956911359, + "grad_norm": 1.3033015727996826, + "kl": 0.4345703125, + "learning_rate": 5.148337830188885e-07, + "loss": 0.0071, + "reward": 1.1450893580913544, + "reward_std": 0.22009466588497162, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9553571790456772, + "step": 1916 + }, + { + "completion_length": 861.2366485595703, + "epoch": 0.5726234037786573, + "grad_norm": 1.4130220413208008, + "kl": 0.3642578125, + "learning_rate": 5.143658761647887e-07, + "loss": 0.0289, + "reward": 1.1674107611179352, + "reward_std": 0.1871972419321537, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9709821790456772, + "step": 1917 + }, + { + "completion_length": 883.1964874267578, + "epoch": 0.5729221118661788, + "grad_norm": 0.7446038722991943, + "kl": 0.453125, + "learning_rate": 5.13898008077123e-07, + "loss": -0.0125, + "reward": 1.0334821939468384, + "reward_std": 0.16733261570334435, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.9732143133878708, + "step": 1918 + }, + { + "completion_length": 883.4643249511719, + "epoch": 0.5732208199537002, + "grad_norm": 0.8829500675201416, + "kl": 0.4326171875, + "learning_rate": 5.134301792648864e-07, + "loss": 0.0436, + "reward": 1.1183035969734192, + "reward_std": 0.2672489993274212, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9531250447034836, + "step": 1919 + }, + { + "completion_length": 840.6562805175781, + "epoch": 0.5735195280412217, + "grad_norm": 1.7534878253936768, + "kl": 0.564453125, + "learning_rate": 5.129623902370304e-07, + "loss": 0.0406, + "reward": 1.1316964775323868, + "reward_std": 0.2856307066977024, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9441964626312256, + "step": 1920 + }, + { + "completion_length": 857.4955902099609, + "epoch": 0.5738182361287432, + "grad_norm": 1.1005090475082397, + "kl": 0.4375, + "learning_rate": 5.124946415024635e-07, + "loss": 0.0203, + "reward": 1.0892857760190964, + "reward_std": 0.18083476461470127, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.9687500447034836, + "step": 1921 + }, + { + "completion_length": 909.2165679931641, + "epoch": 0.5741169442162647, + "grad_norm": 0.9011802077293396, + "kl": 0.621826171875, + "learning_rate": 5.120269335700511e-07, + "loss": 0.0092, + "reward": 1.1696428954601288, + "reward_std": 0.27668147161602974, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.9553571790456772, + "step": 1922 + }, + { + "completion_length": 828.513427734375, + "epoch": 0.5744156523037861, + "grad_norm": 0.7158904075622559, + "kl": 0.49365234375, + "learning_rate": 5.115592669486131e-07, + "loss": 0.0241, + "reward": 1.2299107611179352, + "reward_std": 0.19756145402789116, + "rewards/accuracy_reward": 0.27455358020961285, + "rewards/format_reward": 0.9553571790456772, + "step": 1923 + }, + { + "completion_length": 899.2277221679688, + "epoch": 0.5747143603913076, + "grad_norm": 2.5098941326141357, + "kl": 0.564453125, + "learning_rate": 5.110916421469249e-07, + "loss": 0.0173, + "reward": 1.1808036267757416, + "reward_std": 0.21681205928325653, + "rewards/accuracy_reward": 0.21651786286383867, + "rewards/format_reward": 0.9642857611179352, + "step": 1924 + }, + { + "completion_length": 834.3214721679688, + "epoch": 0.575013068478829, + "grad_norm": 2.2395431995391846, + "kl": 0.693359375, + "learning_rate": 5.106240596737168e-07, + "loss": 0.0263, + "reward": 1.1584822237491608, + "reward_std": 0.18522927537560463, + "rewards/accuracy_reward": 0.1941964328289032, + "rewards/format_reward": 0.9642857611179352, + "step": 1925 + }, + { + "completion_length": 779.9129791259766, + "epoch": 0.5753117765663506, + "grad_norm": 1.7085789442062378, + "kl": 0.552734375, + "learning_rate": 5.101565200376725e-07, + "loss": 0.0183, + "reward": 1.1383929550647736, + "reward_std": 0.23160428553819656, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9776786118745804, + "step": 1926 + }, + { + "completion_length": 800.3928833007812, + "epoch": 0.575610484653872, + "grad_norm": 1.44102942943573, + "kl": 0.615234375, + "learning_rate": 5.096890237474292e-07, + "loss": 0.0336, + "reward": 1.0870535969734192, + "reward_std": 0.21643086895346642, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.9687500447034836, + "step": 1927 + }, + { + "completion_length": 925.7054138183594, + "epoch": 0.5759091927413935, + "grad_norm": 1.1909968852996826, + "kl": 0.6171875, + "learning_rate": 5.092215713115772e-07, + "loss": 0.0262, + "reward": 1.0736607611179352, + "reward_std": 0.2947627604007721, + "rewards/accuracy_reward": 0.14285714970901608, + "rewards/format_reward": 0.9308036118745804, + "step": 1928 + }, + { + "completion_length": 836.138427734375, + "epoch": 0.5762079008289149, + "grad_norm": 2.046005964279175, + "kl": 0.66748046875, + "learning_rate": 5.08754163238659e-07, + "loss": 0.022, + "reward": 1.0825893580913544, + "reward_std": 0.2344849407672882, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.957589328289032, + "step": 1929 + }, + { + "completion_length": 851.1384124755859, + "epoch": 0.5765066089164365, + "grad_norm": 1.7527658939361572, + "kl": 0.4638671875, + "learning_rate": 5.082868000371686e-07, + "loss": 0.0224, + "reward": 1.0892857611179352, + "reward_std": 0.269708463922143, + "rewards/accuracy_reward": 0.14732143399305642, + "rewards/format_reward": 0.941964328289032, + "step": 1930 + }, + { + "completion_length": 925.6228179931641, + "epoch": 0.5768053170039579, + "grad_norm": 1.176780104637146, + "kl": 0.4658203125, + "learning_rate": 5.078194822155513e-07, + "loss": 0.0088, + "reward": 1.0513393133878708, + "reward_std": 0.1651557395234704, + "rewards/accuracy_reward": 0.08258929196745157, + "rewards/format_reward": 0.9687500447034836, + "step": 1931 + }, + { + "completion_length": 933.2031555175781, + "epoch": 0.5771040250914794, + "grad_norm": 1.19839346408844, + "kl": 0.4521484375, + "learning_rate": 5.073522102822034e-07, + "loss": 0.0215, + "reward": 1.2366071939468384, + "reward_std": 0.2826160527765751, + "rewards/accuracy_reward": 0.2767857313156128, + "rewards/format_reward": 0.9598214626312256, + "step": 1932 + }, + { + "completion_length": 796.5469055175781, + "epoch": 0.5774027331790008, + "grad_norm": 0.6728332042694092, + "kl": 0.43603515625, + "learning_rate": 5.068849847454709e-07, + "loss": 0.0185, + "reward": 1.2075893580913544, + "reward_std": 0.2032300904393196, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9709821790456772, + "step": 1933 + }, + { + "completion_length": 844.294677734375, + "epoch": 0.5777014412665223, + "grad_norm": 1.4978336095809937, + "kl": 0.4892578125, + "learning_rate": 5.06417806113649e-07, + "loss": 0.0446, + "reward": 1.0937500596046448, + "reward_std": 0.22108405455946922, + "rewards/accuracy_reward": 0.13169643026776612, + "rewards/format_reward": 0.9620536267757416, + "step": 1934 + }, + { + "completion_length": 791.2076110839844, + "epoch": 0.5780001493540438, + "grad_norm": 0.8485219478607178, + "kl": 0.372314453125, + "learning_rate": 5.059506748949825e-07, + "loss": -0.0104, + "reward": 1.0892857611179352, + "reward_std": 0.2406378611922264, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9486607611179352, + "step": 1935 + }, + { + "completion_length": 918.4933471679688, + "epoch": 0.5782988574415653, + "grad_norm": 1.2757697105407715, + "kl": 0.45361328125, + "learning_rate": 5.054835915976646e-07, + "loss": 0.0118, + "reward": 1.2053571939468384, + "reward_std": 0.3170680105686188, + "rewards/accuracy_reward": 0.27008929289877415, + "rewards/format_reward": 0.93526791036129, + "step": 1936 + }, + { + "completion_length": 910.4308471679688, + "epoch": 0.5785975655290867, + "grad_norm": 0.7874536514282227, + "kl": 0.34765625, + "learning_rate": 5.05016556729836e-07, + "loss": 0.0174, + "reward": 1.0736607611179352, + "reward_std": 0.18525423295795918, + "rewards/accuracy_reward": 0.09598215110599995, + "rewards/format_reward": 0.9776786118745804, + "step": 1937 + }, + { + "completion_length": 828.716552734375, + "epoch": 0.5788962736166081, + "grad_norm": 1.6602522134780884, + "kl": 0.28955078125, + "learning_rate": 5.045495707995847e-07, + "loss": 0.005, + "reward": 1.1785714626312256, + "reward_std": 0.19416319578886032, + "rewards/accuracy_reward": 0.19642858300358057, + "rewards/format_reward": 0.9821428954601288, + "step": 1938 + }, + { + "completion_length": 852.8638916015625, + "epoch": 0.5791949817041296, + "grad_norm": 1.1119974851608276, + "kl": 0.430419921875, + "learning_rate": 5.040826343149458e-07, + "loss": 0.0335, + "reward": 1.1562500894069672, + "reward_std": 0.26864152029156685, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9620536267757416, + "step": 1939 + }, + { + "completion_length": 810.0781707763672, + "epoch": 0.5794936897916511, + "grad_norm": 2.0744943618774414, + "kl": 0.458984375, + "learning_rate": 5.036157477839004e-07, + "loss": 0.0115, + "reward": 1.125000074505806, + "reward_std": 0.32114190235733986, + "rewards/accuracy_reward": 0.18303572502918541, + "rewards/format_reward": 0.9419643133878708, + "step": 1940 + }, + { + "completion_length": 847.4375457763672, + "epoch": 0.5797923978791726, + "grad_norm": 0.9263353943824768, + "kl": 0.3857421875, + "learning_rate": 5.031489117143753e-07, + "loss": 0.0263, + "reward": 1.1540179550647736, + "reward_std": 0.2936001308262348, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9709821790456772, + "step": 1941 + }, + { + "completion_length": 809.8192291259766, + "epoch": 0.580091105966694, + "grad_norm": 1.1823790073394775, + "kl": 0.36767578125, + "learning_rate": 5.026821266142422e-07, + "loss": 0.0051, + "reward": 1.2477679550647736, + "reward_std": 0.3115636333823204, + "rewards/accuracy_reward": 0.30133928917348385, + "rewards/format_reward": 0.9464286118745804, + "step": 1942 + }, + { + "completion_length": 798.5000305175781, + "epoch": 0.5803898140542155, + "grad_norm": 0.9117213487625122, + "kl": 0.33935546875, + "learning_rate": 5.022153929913178e-07, + "loss": 0.0361, + "reward": 1.2053571939468384, + "reward_std": 0.1901232823729515, + "rewards/accuracy_reward": 0.23214286379516125, + "rewards/format_reward": 0.973214328289032, + "step": 1943 + }, + { + "completion_length": 839.3170013427734, + "epoch": 0.5806885221417369, + "grad_norm": 0.6719172596931458, + "kl": 0.326171875, + "learning_rate": 5.017487113533625e-07, + "loss": 0.0161, + "reward": 1.0691964626312256, + "reward_std": 0.19858355075120926, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.9843750447034836, + "step": 1944 + }, + { + "completion_length": 869.4576263427734, + "epoch": 0.5809872302292585, + "grad_norm": 1.1811107397079468, + "kl": 0.3486328125, + "learning_rate": 5.012820822080799e-07, + "loss": 0.0086, + "reward": 1.1227678954601288, + "reward_std": 0.20468377135694027, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9754464626312256, + "step": 1945 + }, + { + "completion_length": 842.3951263427734, + "epoch": 0.5812859383167799, + "grad_norm": 1.2572861909866333, + "kl": 0.3759765625, + "learning_rate": 5.008155060631171e-07, + "loss": 0.0035, + "reward": 1.162946492433548, + "reward_std": 0.23524178192019463, + "rewards/accuracy_reward": 0.19866072200238705, + "rewards/format_reward": 0.9642857611179352, + "step": 1946 + }, + { + "completion_length": 810.4397735595703, + "epoch": 0.5815846464043014, + "grad_norm": 1.670554280281067, + "kl": 0.39794921875, + "learning_rate": 5.003489834260631e-07, + "loss": 0.0058, + "reward": 1.1428571939468384, + "reward_std": 0.2070111222565174, + "rewards/accuracy_reward": 0.17857143841683865, + "rewards/format_reward": 0.964285746216774, + "step": 1947 + }, + { + "completion_length": 765.9955749511719, + "epoch": 0.5818833544918228, + "grad_norm": 1.3481606245040894, + "kl": 0.454345703125, + "learning_rate": 4.99882514804449e-07, + "loss": 0.0181, + "reward": 1.2500000596046448, + "reward_std": 0.2511608935892582, + "rewards/accuracy_reward": 0.2767857275903225, + "rewards/format_reward": 0.9732143431901932, + "step": 1948 + }, + { + "completion_length": 827.8772735595703, + "epoch": 0.5821820625793444, + "grad_norm": 0.7858123779296875, + "kl": 0.41845703125, + "learning_rate": 4.994161007057465e-07, + "loss": 0.044, + "reward": 1.1227679252624512, + "reward_std": 0.24192952178418636, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9620536267757416, + "step": 1949 + }, + { + "completion_length": 803.9910888671875, + "epoch": 0.5824807706668658, + "grad_norm": 1.2597627639770508, + "kl": 0.49267578125, + "learning_rate": 4.989497416373687e-07, + "loss": 0.0454, + "reward": 1.131696492433548, + "reward_std": 0.21637624874711037, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9464286118745804, + "step": 1950 + }, + { + "completion_length": 811.169677734375, + "epoch": 0.5827794787543873, + "grad_norm": 0.7169228196144104, + "kl": 0.35498046875, + "learning_rate": 4.984834381066687e-07, + "loss": 0.0066, + "reward": 1.2500000298023224, + "reward_std": 0.2646065652370453, + "rewards/accuracy_reward": 0.2879464402794838, + "rewards/format_reward": 0.9620536118745804, + "step": 1951 + }, + { + "completion_length": 789.7857513427734, + "epoch": 0.5830781868419087, + "grad_norm": 0.8088078498840332, + "kl": 0.281494140625, + "learning_rate": 4.980171906209386e-07, + "loss": 0.0222, + "reward": 1.2232143580913544, + "reward_std": 0.2406474817544222, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9754464626312256, + "step": 1952 + }, + { + "completion_length": 853.8616485595703, + "epoch": 0.5833768949294302, + "grad_norm": 0.8072922825813293, + "kl": 0.22998046875, + "learning_rate": 4.975509996874106e-07, + "loss": 0.0055, + "reward": 1.113839328289032, + "reward_std": 0.26085977256298065, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.9642857313156128, + "step": 1953 + }, + { + "completion_length": 778.2522735595703, + "epoch": 0.5836756030169516, + "grad_norm": 0.5309399366378784, + "kl": 0.2283935546875, + "learning_rate": 4.970848658132541e-07, + "loss": 0.0269, + "reward": 1.238839328289032, + "reward_std": 0.212053332477808, + "rewards/accuracy_reward": 0.27008930034935474, + "rewards/format_reward": 0.9687500447034836, + "step": 1954 + }, + { + "completion_length": 808.4576263427734, + "epoch": 0.5839743111044732, + "grad_norm": 0.7942214012145996, + "kl": 0.343017578125, + "learning_rate": 4.966187895055776e-07, + "loss": 0.0407, + "reward": 1.1517857611179352, + "reward_std": 0.25685544684529305, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.957589328289032, + "step": 1955 + }, + { + "completion_length": 875.0446929931641, + "epoch": 0.5842730191919946, + "grad_norm": 1.5754643678665161, + "kl": 0.3779296875, + "learning_rate": 4.961527712714259e-07, + "loss": 0.0017, + "reward": 1.0870536267757416, + "reward_std": 0.2281561940908432, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9553571790456772, + "step": 1956 + }, + { + "completion_length": 810.1429138183594, + "epoch": 0.5845717272795161, + "grad_norm": 2.499127149581909, + "kl": 0.41064453125, + "learning_rate": 4.956868116177816e-07, + "loss": 0.0468, + "reward": 1.2410714626312256, + "reward_std": 0.22533666156232357, + "rewards/accuracy_reward": 0.2678571604192257, + "rewards/format_reward": 0.973214328289032, + "step": 1957 + }, + { + "completion_length": 773.1763763427734, + "epoch": 0.5848704353670375, + "grad_norm": 2.0492196083068848, + "kl": 0.28759765625, + "learning_rate": 4.95220911051563e-07, + "loss": 0.0017, + "reward": 1.1830357611179352, + "reward_std": 0.19221877306699753, + "rewards/accuracy_reward": 0.21205358440056443, + "rewards/format_reward": 0.9709821790456772, + "step": 1958 + }, + { + "completion_length": 856.9531555175781, + "epoch": 0.5851691434545591, + "grad_norm": 0.7329577803611755, + "kl": 0.4169921875, + "learning_rate": 4.947550700796242e-07, + "loss": 0.0245, + "reward": 1.1696428954601288, + "reward_std": 0.27092981338500977, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.96651791036129, + "step": 1959 + }, + { + "completion_length": 799.544677734375, + "epoch": 0.5854678515420805, + "grad_norm": 0.9043151140213013, + "kl": 0.386962890625, + "learning_rate": 4.942892892087546e-07, + "loss": 0.0223, + "reward": 1.084821492433548, + "reward_std": 0.19500546716153622, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.9709821939468384, + "step": 1960 + }, + { + "completion_length": 879.8750457763672, + "epoch": 0.585766559629602, + "grad_norm": 1.0202783346176147, + "kl": 0.51611328125, + "learning_rate": 4.938235689456782e-07, + "loss": 0.0067, + "reward": 1.1071428954601288, + "reward_std": 0.22963611781597137, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9531250447034836, + "step": 1961 + }, + { + "completion_length": 879.919677734375, + "epoch": 0.5860652677171234, + "grad_norm": 2.492277145385742, + "kl": 0.66845703125, + "learning_rate": 4.933579097970529e-07, + "loss": 0.0185, + "reward": 1.2165178954601288, + "reward_std": 0.2897607162594795, + "rewards/accuracy_reward": 0.267857164144516, + "rewards/format_reward": 0.9486607611179352, + "step": 1962 + }, + { + "completion_length": 855.1875305175781, + "epoch": 0.5863639758046449, + "grad_norm": 1.621219515800476, + "kl": 0.56787109375, + "learning_rate": 4.9289231226947e-07, + "loss": 0.03, + "reward": 1.0848214626312256, + "reward_std": 0.22954360395669937, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9531250447034836, + "step": 1963 + }, + { + "completion_length": 840.2120971679688, + "epoch": 0.5866626838921664, + "grad_norm": 0.7027738690376282, + "kl": 0.556640625, + "learning_rate": 4.924267768694544e-07, + "loss": 0.0628, + "reward": 1.0959822237491608, + "reward_std": 0.20396742597222328, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.9687500298023224, + "step": 1964 + }, + { + "completion_length": 836.6094055175781, + "epoch": 0.5869613919796879, + "grad_norm": 1.2603881359100342, + "kl": 0.66015625, + "learning_rate": 4.919613041034628e-07, + "loss": 0.0207, + "reward": 1.113839328289032, + "reward_std": 0.2761802263557911, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9665178954601288, + "step": 1965 + }, + { + "completion_length": 781.4844207763672, + "epoch": 0.5872601000672093, + "grad_norm": 0.9633646607398987, + "kl": 0.4697265625, + "learning_rate": 4.914958944778837e-07, + "loss": 0.0213, + "reward": 1.1808036267757416, + "reward_std": 0.27391699329018593, + "rewards/accuracy_reward": 0.227678582072258, + "rewards/format_reward": 0.9531250447034836, + "step": 1966 + }, + { + "completion_length": 793.1384124755859, + "epoch": 0.5875588081547308, + "grad_norm": 1.6631712913513184, + "kl": 0.43115234375, + "learning_rate": 4.910305484990377e-07, + "loss": 0.0313, + "reward": 1.2477678954601288, + "reward_std": 0.23210379481315613, + "rewards/accuracy_reward": 0.2767857238650322, + "rewards/format_reward": 0.9709821939468384, + "step": 1967 + }, + { + "completion_length": 850.9152221679688, + "epoch": 0.5878575162422522, + "grad_norm": 0.9824408292770386, + "kl": 0.37646484375, + "learning_rate": 4.905652666731751e-07, + "loss": 0.0119, + "reward": 1.0290179252624512, + "reward_std": 0.17749789729714394, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.9665178954601288, + "step": 1968 + }, + { + "completion_length": 935.2924499511719, + "epoch": 0.5881562243297738, + "grad_norm": 1.0595470666885376, + "kl": 0.59716796875, + "learning_rate": 4.901000495064774e-07, + "loss": 0.0519, + "reward": 1.0781250447034836, + "reward_std": 0.24114007875323296, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9553571939468384, + "step": 1969 + }, + { + "completion_length": 836.0089569091797, + "epoch": 0.5884549324172952, + "grad_norm": 0.6892067193984985, + "kl": 0.372314453125, + "learning_rate": 4.896348975050546e-07, + "loss": 0.0132, + "reward": 1.0892857909202576, + "reward_std": 0.20126916095614433, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9620536118745804, + "step": 1970 + }, + { + "completion_length": 874.1830749511719, + "epoch": 0.5887536405048167, + "grad_norm": 1.1349952220916748, + "kl": 0.685546875, + "learning_rate": 4.891698111749471e-07, + "loss": 0.0272, + "reward": 1.0558036118745804, + "reward_std": 0.22897856310009956, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.9464286118745804, + "step": 1971 + }, + { + "completion_length": 829.1741485595703, + "epoch": 0.5890523485923381, + "grad_norm": 2.1406466960906982, + "kl": 0.43212890625, + "learning_rate": 4.88704791022123e-07, + "loss": 0.0114, + "reward": 1.2812500298023224, + "reward_std": 0.23569799959659576, + "rewards/accuracy_reward": 0.3281250186264515, + "rewards/format_reward": 0.9531250447034836, + "step": 1972 + }, + { + "completion_length": 900.2656707763672, + "epoch": 0.5893510566798597, + "grad_norm": 1.1814745664596558, + "kl": 0.669921875, + "learning_rate": 4.882398375524789e-07, + "loss": 0.005, + "reward": 1.1450893580913544, + "reward_std": 0.2565786764025688, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.964285746216774, + "step": 1973 + }, + { + "completion_length": 843.2678833007812, + "epoch": 0.5896497647673811, + "grad_norm": 1.3188457489013672, + "kl": 0.52197265625, + "learning_rate": 4.877749512718381e-07, + "loss": 0.0122, + "reward": 1.1093750596046448, + "reward_std": 0.24576054885983467, + "rewards/accuracy_reward": 0.14955358393490314, + "rewards/format_reward": 0.9598214477300644, + "step": 1974 + }, + { + "completion_length": 787.4620819091797, + "epoch": 0.5899484728549026, + "grad_norm": 0.9561645984649658, + "kl": 0.5859375, + "learning_rate": 4.873101326859517e-07, + "loss": 0.0191, + "reward": 1.2299107909202576, + "reward_std": 0.30010953918099403, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.957589328289032, + "step": 1975 + }, + { + "completion_length": 862.388427734375, + "epoch": 0.590247180942424, + "grad_norm": 3.0285356044769287, + "kl": 0.796875, + "learning_rate": 4.868453823004967e-07, + "loss": 0.0063, + "reward": 1.1517857909202576, + "reward_std": 0.23522105813026428, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.9531250447034836, + "step": 1976 + }, + { + "completion_length": 872.1540832519531, + "epoch": 0.5905458890299455, + "grad_norm": 1.6532886028289795, + "kl": 0.53271484375, + "learning_rate": 4.86380700621076e-07, + "loss": 0.0255, + "reward": 1.116071492433548, + "reward_std": 0.18703452497720718, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.957589328289032, + "step": 1977 + }, + { + "completion_length": 922.0736999511719, + "epoch": 0.590844597117467, + "grad_norm": 2.388432264328003, + "kl": 0.64794921875, + "learning_rate": 4.859160881532176e-07, + "loss": 0.0058, + "reward": 1.0959821790456772, + "reward_std": 0.20345653966069221, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.948660746216774, + "step": 1978 + }, + { + "completion_length": 889.4911193847656, + "epoch": 0.5911433052049885, + "grad_norm": 2.264756917953491, + "kl": 0.65771484375, + "learning_rate": 4.854515454023745e-07, + "loss": 0.0149, + "reward": 1.098214328289032, + "reward_std": 0.23814951814711094, + "rewards/accuracy_reward": 0.12723215040750802, + "rewards/format_reward": 0.9709821790456772, + "step": 1979 + }, + { + "completion_length": 870.9464721679688, + "epoch": 0.5914420132925099, + "grad_norm": 3.175175189971924, + "kl": 0.751953125, + "learning_rate": 4.849870728739234e-07, + "loss": -0.0004, + "reward": 1.0892857611179352, + "reward_std": 0.28113603591918945, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.9397321939468384, + "step": 1980 + }, + { + "completion_length": 833.247802734375, + "epoch": 0.5917407213800313, + "grad_norm": 1.6675584316253662, + "kl": 0.595703125, + "learning_rate": 4.845226710731651e-07, + "loss": 0.0024, + "reward": 1.1316964626312256, + "reward_std": 0.2436928041279316, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.948660746216774, + "step": 1981 + }, + { + "completion_length": 862.107177734375, + "epoch": 0.5920394294675528, + "grad_norm": 2.031165361404419, + "kl": 0.705078125, + "learning_rate": 4.840583405053233e-07, + "loss": 0.0291, + "reward": 1.1026786267757416, + "reward_std": 0.17659412696957588, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.9843750447034836, + "step": 1982 + }, + { + "completion_length": 793.7857513427734, + "epoch": 0.5923381375550743, + "grad_norm": 1.7503331899642944, + "kl": 0.5751953125, + "learning_rate": 4.83594081675544e-07, + "loss": -0.0073, + "reward": 1.1383928954601288, + "reward_std": 0.2634962126612663, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9553571939468384, + "step": 1983 + }, + { + "completion_length": 937.0134429931641, + "epoch": 0.5926368456425958, + "grad_norm": 1.6459509134292603, + "kl": 0.58544921875, + "learning_rate": 4.831298950888953e-07, + "loss": 0.0117, + "reward": 1.0937500596046448, + "reward_std": 0.24440046399831772, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.9598214626312256, + "step": 1984 + }, + { + "completion_length": 903.8638763427734, + "epoch": 0.5929355537301172, + "grad_norm": 0.8203364610671997, + "kl": 0.49609375, + "learning_rate": 4.826657812503668e-07, + "loss": -0.0076, + "reward": 1.270089328289032, + "reward_std": 0.30587076395750046, + "rewards/accuracy_reward": 0.3191964477300644, + "rewards/format_reward": 0.9508928805589676, + "step": 1985 + }, + { + "completion_length": 789.9174346923828, + "epoch": 0.5932342618176387, + "grad_norm": 1.5214643478393555, + "kl": 0.50341796875, + "learning_rate": 4.822017406648689e-07, + "loss": 0.0012, + "reward": 1.1227678954601288, + "reward_std": 0.24399186298251152, + "rewards/accuracy_reward": 0.16741072246804833, + "rewards/format_reward": 0.9553571790456772, + "step": 1986 + }, + { + "completion_length": 889.732177734375, + "epoch": 0.5935329699051601, + "grad_norm": 1.6470446586608887, + "kl": 0.61767578125, + "learning_rate": 4.817377738372321e-07, + "loss": -0.046, + "reward": 1.0758928954601288, + "reward_std": 0.2570034973323345, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9464286118745804, + "step": 1987 + }, + { + "completion_length": 800.154052734375, + "epoch": 0.5938316779926817, + "grad_norm": 0.7218371033668518, + "kl": 0.57568359375, + "learning_rate": 4.812738812722069e-07, + "loss": 0.0069, + "reward": 1.1741071939468384, + "reward_std": 0.2060352861881256, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.9754464775323868, + "step": 1988 + }, + { + "completion_length": 843.3504791259766, + "epoch": 0.5941303860802031, + "grad_norm": 1.1153857707977295, + "kl": 0.3984375, + "learning_rate": 4.808100634744632e-07, + "loss": 0.023, + "reward": 1.1785714626312256, + "reward_std": 0.22013043239712715, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9687500447034836, + "step": 1989 + }, + { + "completion_length": 833.8058319091797, + "epoch": 0.5944290941677246, + "grad_norm": 1.0642085075378418, + "kl": 0.55126953125, + "learning_rate": 4.803463209485891e-07, + "loss": 0.0115, + "reward": 1.1986607611179352, + "reward_std": 0.23477336391806602, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.96651791036129, + "step": 1990 + }, + { + "completion_length": 938.4263610839844, + "epoch": 0.594727802255246, + "grad_norm": 1.175445556640625, + "kl": 0.5810546875, + "learning_rate": 4.798826541990908e-07, + "loss": 0.0309, + "reward": 1.0691965073347092, + "reward_std": 0.26163026690483093, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9598214775323868, + "step": 1991 + }, + { + "completion_length": 793.6786041259766, + "epoch": 0.5950265103427675, + "grad_norm": 1.1118372678756714, + "kl": 0.46533203125, + "learning_rate": 4.794190637303926e-07, + "loss": 0.0127, + "reward": 1.2366071939468384, + "reward_std": 0.22658236883580685, + "rewards/accuracy_reward": 0.25223215389996767, + "rewards/format_reward": 0.9843750447034836, + "step": 1992 + }, + { + "completion_length": 911.5737152099609, + "epoch": 0.595325218430289, + "grad_norm": 0.5863997340202332, + "kl": 0.46826171875, + "learning_rate": 4.789555500468354e-07, + "loss": 0.0035, + "reward": 1.0245536118745804, + "reward_std": 0.2123095542192459, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.9620536118745804, + "step": 1993 + }, + { + "completion_length": 957.0870971679688, + "epoch": 0.5956239265178105, + "grad_norm": 2.095247268676758, + "kl": 0.3974609375, + "learning_rate": 4.784921136526767e-07, + "loss": 0.0369, + "reward": 1.064732164144516, + "reward_std": 0.2353230156004429, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.964285746216774, + "step": 1994 + }, + { + "completion_length": 954.9933319091797, + "epoch": 0.5959226346053319, + "grad_norm": 0.7935923337936401, + "kl": 0.62548828125, + "learning_rate": 4.780287550520896e-07, + "loss": 0.0105, + "reward": 1.116071492433548, + "reward_std": 0.2543617859482765, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9620536118745804, + "step": 1995 + }, + { + "completion_length": 925.6183471679688, + "epoch": 0.5962213426928534, + "grad_norm": 1.2943488359451294, + "kl": 0.412353515625, + "learning_rate": 4.775654747491633e-07, + "loss": 0.0252, + "reward": 1.1383928954601288, + "reward_std": 0.2237834595143795, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.9486607611179352, + "step": 1996 + }, + { + "completion_length": 853.3772583007812, + "epoch": 0.5965200507803748, + "grad_norm": 0.9247100949287415, + "kl": 0.41552734375, + "learning_rate": 4.771022732479011e-07, + "loss": -0.009, + "reward": 1.1852679252624512, + "reward_std": 0.17890140414237976, + "rewards/accuracy_reward": 0.20535714738070965, + "rewards/format_reward": 0.9799107313156128, + "step": 1997 + }, + { + "completion_length": 915.9308471679688, + "epoch": 0.5968187588678964, + "grad_norm": 1.165497064590454, + "kl": 0.486328125, + "learning_rate": 4.7663915105222075e-07, + "loss": 0.0031, + "reward": 1.1160714626312256, + "reward_std": 0.15274551697075367, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9687500447034836, + "step": 1998 + }, + { + "completion_length": 916.5223846435547, + "epoch": 0.5971174669554178, + "grad_norm": 0.7632347941398621, + "kl": 0.33203125, + "learning_rate": 4.76176108665954e-07, + "loss": 0.0153, + "reward": 1.0491072237491608, + "reward_std": 0.21183187142014503, + "rewards/accuracy_reward": 0.08482143469154835, + "rewards/format_reward": 0.964285746216774, + "step": 1999 + }, + { + "completion_length": 805.4955749511719, + "epoch": 0.5974161750429393, + "grad_norm": 0.9222873449325562, + "kl": 0.373046875, + "learning_rate": 4.7571314659284545e-07, + "loss": 0.0002, + "reward": 1.2165178954601288, + "reward_std": 0.24764298275113106, + "rewards/accuracy_reward": 0.24330358020961285, + "rewards/format_reward": 0.973214328289032, + "step": 2000 + }, + { + "completion_length": 846.4129791259766, + "epoch": 0.5977148831304607, + "grad_norm": 0.6132627129554749, + "kl": 0.344482421875, + "learning_rate": 4.7525026533655264e-07, + "loss": -0.0029, + "reward": 1.1093750298023224, + "reward_std": 0.1826299950480461, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9799107760190964, + "step": 2001 + }, + { + "completion_length": 824.1518249511719, + "epoch": 0.5980135912179823, + "grad_norm": 1.8684029579162598, + "kl": 0.42724609375, + "learning_rate": 4.747874654006447e-07, + "loss": 0.0583, + "reward": 1.238839328289032, + "reward_std": 0.23137423768639565, + "rewards/accuracy_reward": 0.2700892947614193, + "rewards/format_reward": 0.9687500447034836, + "step": 2002 + }, + { + "completion_length": 858.9911193847656, + "epoch": 0.5983122993055037, + "grad_norm": 1.8075538873672485, + "kl": 0.429931640625, + "learning_rate": 4.7432474728860286e-07, + "loss": 0.0283, + "reward": 1.3214286267757416, + "reward_std": 0.2638433538377285, + "rewards/accuracy_reward": 0.3705357313156128, + "rewards/format_reward": 0.9508928954601288, + "step": 2003 + }, + { + "completion_length": 961.2589721679688, + "epoch": 0.5986110073930252, + "grad_norm": 0.7623814344406128, + "kl": 0.273681640625, + "learning_rate": 4.73862111503819e-07, + "loss": 0.0346, + "reward": 1.0848214626312256, + "reward_std": 0.22872832044959068, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9531250447034836, + "step": 2004 + }, + { + "completion_length": 806.8460235595703, + "epoch": 0.5989097154805466, + "grad_norm": 2.26440691947937, + "kl": 0.3720703125, + "learning_rate": 4.7339955854959545e-07, + "loss": 0.0305, + "reward": 1.2142857611179352, + "reward_std": 0.18349603563547134, + "rewards/accuracy_reward": 0.2388393022119999, + "rewards/format_reward": 0.9754464775323868, + "step": 2005 + }, + { + "completion_length": 876.8772735595703, + "epoch": 0.5992084235680681, + "grad_norm": 0.9315153956413269, + "kl": 0.367431640625, + "learning_rate": 4.729370889291446e-07, + "loss": -0.0017, + "reward": 1.1584821790456772, + "reward_std": 0.23446180298924446, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9598214775323868, + "step": 2006 + }, + { + "completion_length": 902.0692291259766, + "epoch": 0.5995071316555896, + "grad_norm": 1.6192340850830078, + "kl": 0.30029296875, + "learning_rate": 4.7247470314558814e-07, + "loss": 0.0519, + "reward": 1.0959821939468384, + "reward_std": 0.22262869402766228, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.973214328289032, + "step": 2007 + }, + { + "completion_length": 930.6294860839844, + "epoch": 0.5998058397431111, + "grad_norm": 0.9291579127311707, + "kl": 0.5673828125, + "learning_rate": 4.7201240170195624e-07, + "loss": 0.0138, + "reward": 1.069196492433548, + "reward_std": 0.26155635342001915, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.957589328289032, + "step": 2008 + }, + { + "completion_length": 889.2678985595703, + "epoch": 0.6001045478306325, + "grad_norm": 1.1438790559768677, + "kl": 0.603271484375, + "learning_rate": 4.715501851011877e-07, + "loss": 0.0054, + "reward": 1.1116071939468384, + "reward_std": 0.2825511284172535, + "rewards/accuracy_reward": 0.16517858020961285, + "rewards/format_reward": 0.9464286118745804, + "step": 2009 + }, + { + "completion_length": 892.4799499511719, + "epoch": 0.600403255918154, + "grad_norm": 2.050014019012451, + "kl": 0.72265625, + "learning_rate": 4.7108805384612884e-07, + "loss": 0.0454, + "reward": 1.1316964626312256, + "reward_std": 0.2648773491382599, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9375000596046448, + "step": 2010 + }, + { + "completion_length": 889.872802734375, + "epoch": 0.6007019640056754, + "grad_norm": 1.0089659690856934, + "kl": 0.3681640625, + "learning_rate": 4.706260084395333e-07, + "loss": 0.0347, + "reward": 1.1473214626312256, + "reward_std": 0.22056974843144417, + "rewards/accuracy_reward": 0.18080357951112092, + "rewards/format_reward": 0.96651791036129, + "step": 2011 + }, + { + "completion_length": 925.7522735595703, + "epoch": 0.601000672093197, + "grad_norm": 1.051178216934204, + "kl": 0.3857421875, + "learning_rate": 4.701640493840608e-07, + "loss": 0.0053, + "reward": 1.209821492433548, + "reward_std": 0.2415994256734848, + "rewards/accuracy_reward": 0.2433035857975483, + "rewards/format_reward": 0.9665178805589676, + "step": 2012 + }, + { + "completion_length": 804.6786193847656, + "epoch": 0.6012993801807184, + "grad_norm": 0.7954946160316467, + "kl": 0.505859375, + "learning_rate": 4.697021771822781e-07, + "loss": 0.0306, + "reward": 1.1763393580913544, + "reward_std": 0.25418198108673096, + "rewards/accuracy_reward": 0.20312500465661287, + "rewards/format_reward": 0.973214328289032, + "step": 2013 + }, + { + "completion_length": 870.2500305175781, + "epoch": 0.6015980882682399, + "grad_norm": 1.3127436637878418, + "kl": 0.48095703125, + "learning_rate": 4.6924039233665656e-07, + "loss": 0.0361, + "reward": 1.1428571939468384, + "reward_std": 0.24847139045596123, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.970982164144516, + "step": 2014 + }, + { + "completion_length": 829.9620971679688, + "epoch": 0.6018967963557613, + "grad_norm": 0.8841733336448669, + "kl": 0.436767578125, + "learning_rate": 4.687786953495728e-07, + "loss": 0.0135, + "reward": 1.1250000596046448, + "reward_std": 0.2081180065870285, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.973214328289032, + "step": 2015 + }, + { + "completion_length": 875.654052734375, + "epoch": 0.6021955044432828, + "grad_norm": 0.9418075680732727, + "kl": 0.6640625, + "learning_rate": 4.683170867233079e-07, + "loss": 0.0348, + "reward": 1.1361607909202576, + "reward_std": 0.2746497243642807, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9553571790456772, + "step": 2016 + }, + { + "completion_length": 850.6339874267578, + "epoch": 0.6024942125308043, + "grad_norm": 1.0790538787841797, + "kl": 0.42138671875, + "learning_rate": 4.67855566960047e-07, + "loss": 0.0169, + "reward": 1.1517857909202576, + "reward_std": 0.263980507850647, + "rewards/accuracy_reward": 0.18080357648432255, + "rewards/format_reward": 0.9709821790456772, + "step": 2017 + }, + { + "completion_length": 874.5647735595703, + "epoch": 0.6027929206183258, + "grad_norm": 0.9736719727516174, + "kl": 0.6767578125, + "learning_rate": 4.673941365618781e-07, + "loss": 0.0169, + "reward": 1.1339286267757416, + "reward_std": 0.30190519243478775, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/format_reward": 0.9464286118745804, + "step": 2018 + }, + { + "completion_length": 954.3147888183594, + "epoch": 0.6030916287058472, + "grad_norm": 2.123206377029419, + "kl": 0.4755859375, + "learning_rate": 4.669327960307924e-07, + "loss": 0.0033, + "reward": 1.0959821939468384, + "reward_std": 0.2029307745397091, + "rewards/accuracy_reward": 0.13616071757860482, + "rewards/format_reward": 0.9598214626312256, + "step": 2019 + }, + { + "completion_length": 884.075927734375, + "epoch": 0.6033903367933687, + "grad_norm": 2.071270227432251, + "kl": 0.498046875, + "learning_rate": 4.6647154586868323e-07, + "loss": 0.0197, + "reward": 1.1875000596046448, + "reward_std": 0.21644052863121033, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.9620536118745804, + "step": 2020 + }, + { + "completion_length": 784.7344055175781, + "epoch": 0.6036890448808901, + "grad_norm": 1.641680359840393, + "kl": 0.6435546875, + "learning_rate": 4.660103865773455e-07, + "loss": 0.0149, + "reward": 1.2098215222358704, + "reward_std": 0.16185099072754383, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9776786267757416, + "step": 2021 + }, + { + "completion_length": 826.950927734375, + "epoch": 0.6039877529684117, + "grad_norm": 0.9980336427688599, + "kl": 0.583251953125, + "learning_rate": 4.6554931865847526e-07, + "loss": 0.0099, + "reward": 1.071428656578064, + "reward_std": 0.22804669849574566, + "rewards/accuracy_reward": 0.11383929010480642, + "rewards/format_reward": 0.9575893133878708, + "step": 2022 + }, + { + "completion_length": 842.9152221679688, + "epoch": 0.6042864610559331, + "grad_norm": 1.3878732919692993, + "kl": 0.52978515625, + "learning_rate": 4.6508834261366914e-07, + "loss": -0.0047, + "reward": 1.145089328289032, + "reward_std": 0.17720350716263056, + "rewards/accuracy_reward": 0.15848215040750802, + "rewards/format_reward": 0.9866071939468384, + "step": 2023 + }, + { + "completion_length": 830.7812957763672, + "epoch": 0.6045851691434545, + "grad_norm": 0.9786850810050964, + "kl": 0.43017578125, + "learning_rate": 4.646274589444241e-07, + "loss": 0.0151, + "reward": 1.0535714775323868, + "reward_std": 0.15318897739052773, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.9754464626312256, + "step": 2024 + }, + { + "completion_length": 886.7902374267578, + "epoch": 0.604883877230976, + "grad_norm": 0.9871642589569092, + "kl": 0.52099609375, + "learning_rate": 4.641666681521365e-07, + "loss": 0.0227, + "reward": 1.1250000596046448, + "reward_std": 0.2551121525466442, + "rewards/accuracy_reward": 0.16741072246804833, + "rewards/format_reward": 0.957589328289032, + "step": 2025 + }, + { + "completion_length": 911.4754943847656, + "epoch": 0.6051825853184974, + "grad_norm": 1.0673710107803345, + "kl": 0.402587890625, + "learning_rate": 4.6370597073810113e-07, + "loss": 0.0063, + "reward": 1.1540179550647736, + "reward_std": 0.22773733362555504, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.979910746216774, + "step": 2026 + }, + { + "completion_length": 868.3036041259766, + "epoch": 0.605481293406019, + "grad_norm": 1.3516514301300049, + "kl": 0.3671875, + "learning_rate": 4.6324536720351205e-07, + "loss": 0.0063, + "reward": 1.084821492433548, + "reward_std": 0.17403232119977474, + "rewards/accuracy_reward": 0.10714285913854837, + "rewards/format_reward": 0.9776786267757416, + "step": 2027 + }, + { + "completion_length": 803.0491485595703, + "epoch": 0.6057800014935404, + "grad_norm": 2.0403542518615723, + "kl": 0.421875, + "learning_rate": 4.6278485804946044e-07, + "loss": -0.0039, + "reward": 1.191964328289032, + "reward_std": 0.2402138113975525, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/format_reward": 0.973214328289032, + "step": 2028 + }, + { + "completion_length": 803.8214721679688, + "epoch": 0.6060787095810619, + "grad_norm": 1.0009124279022217, + "kl": 0.42578125, + "learning_rate": 4.6232444377693536e-07, + "loss": 0.018, + "reward": 1.160714328289032, + "reward_std": 0.19772258773446083, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.9821428954601288, + "step": 2029 + }, + { + "completion_length": 808.5893402099609, + "epoch": 0.6063774176685833, + "grad_norm": 0.6627427339553833, + "kl": 0.50537109375, + "learning_rate": 4.618641248868221e-07, + "loss": 0.0308, + "reward": 1.2008928954601288, + "reward_std": 0.2644376941025257, + "rewards/accuracy_reward": 0.23437500931322575, + "rewards/format_reward": 0.9665178805589676, + "step": 2030 + }, + { + "completion_length": 787.9799346923828, + "epoch": 0.6066761257561049, + "grad_norm": 1.2687926292419434, + "kl": 0.583984375, + "learning_rate": 4.6140390187990286e-07, + "loss": 0.0239, + "reward": 1.2790179252624512, + "reward_std": 0.25933773815631866, + "rewards/accuracy_reward": 0.3147321604192257, + "rewards/format_reward": 0.9642857611179352, + "step": 2031 + }, + { + "completion_length": 816.3973693847656, + "epoch": 0.6069748338436263, + "grad_norm": 1.0248498916625977, + "kl": 0.5810546875, + "learning_rate": 4.6094377525685515e-07, + "loss": -0.006, + "reward": 1.084821492433548, + "reward_std": 0.23312777653336525, + "rewards/accuracy_reward": 0.13169643888249993, + "rewards/format_reward": 0.9531250447034836, + "step": 2032 + }, + { + "completion_length": 931.982177734375, + "epoch": 0.6072735419311478, + "grad_norm": 1.8925193548202515, + "kl": 0.50927734375, + "learning_rate": 4.6048374551825143e-07, + "loss": 0.0172, + "reward": 1.1160714626312256, + "reward_std": 0.1480227056890726, + "rewards/accuracy_reward": 0.13392857508733869, + "rewards/format_reward": 0.9821428805589676, + "step": 2033 + }, + { + "completion_length": 980.9263763427734, + "epoch": 0.6075722500186692, + "grad_norm": 0.8909876942634583, + "kl": 0.426025390625, + "learning_rate": 4.600238131645592e-07, + "loss": 0.0379, + "reward": 1.098214328289032, + "reward_std": 0.18921851739287376, + "rewards/accuracy_reward": 0.11607143376022577, + "rewards/format_reward": 0.9821428954601288, + "step": 2034 + }, + { + "completion_length": 780.5245971679688, + "epoch": 0.6078709581061907, + "grad_norm": 1.7164790630340576, + "kl": 0.39794921875, + "learning_rate": 4.5956397869613937e-07, + "loss": 0.0261, + "reward": 1.1718750596046448, + "reward_std": 0.19834142364561558, + "rewards/accuracy_reward": 0.20089286752045155, + "rewards/format_reward": 0.9709821790456772, + "step": 2035 + }, + { + "completion_length": 859.5379943847656, + "epoch": 0.6081696661937122, + "grad_norm": 0.9186525940895081, + "kl": 0.345703125, + "learning_rate": 4.591042426132472e-07, + "loss": 0.0198, + "reward": 1.102678656578064, + "reward_std": 0.1697645941749215, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.9821428954601288, + "step": 2036 + }, + { + "completion_length": 810.8772735595703, + "epoch": 0.6084683742812337, + "grad_norm": 1.2654694318771362, + "kl": 0.49560546875, + "learning_rate": 4.5864460541603025e-07, + "loss": 0.0492, + "reward": 1.1361607611179352, + "reward_std": 0.2315182462334633, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9687500298023224, + "step": 2037 + }, + { + "completion_length": 856.7790374755859, + "epoch": 0.6087670823687551, + "grad_norm": 1.7017945051193237, + "kl": 0.297607421875, + "learning_rate": 4.581850676045288e-07, + "loss": 0.0244, + "reward": 1.1964286267757416, + "reward_std": 0.20801029726862907, + "rewards/accuracy_reward": 0.21428572572767735, + "rewards/format_reward": 0.9821428954601288, + "step": 2038 + }, + { + "completion_length": 843.8817291259766, + "epoch": 0.6090657904562766, + "grad_norm": 1.0989787578582764, + "kl": 0.331298828125, + "learning_rate": 4.577256296786751e-07, + "loss": 0.029, + "reward": 1.2008929252624512, + "reward_std": 0.21189764328300953, + "rewards/accuracy_reward": 0.2276785857975483, + "rewards/format_reward": 0.9732143133878708, + "step": 2039 + }, + { + "completion_length": 870.5156707763672, + "epoch": 0.609364498543798, + "grad_norm": 1.6842548847198486, + "kl": 0.50732421875, + "learning_rate": 4.572662921382924e-07, + "loss": 0.0454, + "reward": 1.1897322088479996, + "reward_std": 0.23604296147823334, + "rewards/accuracy_reward": 0.23660715040750802, + "rewards/format_reward": 0.9531250447034836, + "step": 2040 + }, + { + "completion_length": 844.1696929931641, + "epoch": 0.6096632066313196, + "grad_norm": 2.195713758468628, + "kl": 0.345703125, + "learning_rate": 4.5680705548309495e-07, + "loss": 0.0396, + "reward": 1.069196492433548, + "reward_std": 0.19055700302124023, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.9754464626312256, + "step": 2041 + }, + { + "completion_length": 857.4219055175781, + "epoch": 0.609961914718841, + "grad_norm": 1.4526479244232178, + "kl": 0.3984375, + "learning_rate": 4.5634792021268717e-07, + "loss": 0.0514, + "reward": 1.1919643580913544, + "reward_std": 0.2536325678229332, + "rewards/accuracy_reward": 0.2343750111758709, + "rewards/format_reward": 0.9575893133878708, + "step": 2042 + }, + { + "completion_length": 855.8170166015625, + "epoch": 0.6102606228063625, + "grad_norm": 0.8805209994316101, + "kl": 0.391845703125, + "learning_rate": 4.5588888682656336e-07, + "loss": 0.0091, + "reward": 1.0535714775323868, + "reward_std": 0.19339849427342415, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9620536267757416, + "step": 2043 + }, + { + "completion_length": 906.9866333007812, + "epoch": 0.6105593308938839, + "grad_norm": 0.718916118144989, + "kl": 0.49853515625, + "learning_rate": 4.5542995582410693e-07, + "loss": 0.0234, + "reward": 1.1205357760190964, + "reward_std": 0.13176779076457024, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.973214328289032, + "step": 2044 + }, + { + "completion_length": 901.7254791259766, + "epoch": 0.6108580389814054, + "grad_norm": 1.3844130039215088, + "kl": 0.55419921875, + "learning_rate": 4.5497112770458944e-07, + "loss": 0.0316, + "reward": 1.129464328289032, + "reward_std": 0.23047509044408798, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.973214328289032, + "step": 2045 + }, + { + "completion_length": 937.8036193847656, + "epoch": 0.6111567470689269, + "grad_norm": 0.8667473793029785, + "kl": 0.46728515625, + "learning_rate": 4.5451240296717143e-07, + "loss": 0.0077, + "reward": 1.0848214626312256, + "reward_std": 0.2568565681576729, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9553571939468384, + "step": 2046 + }, + { + "completion_length": 801.3013916015625, + "epoch": 0.6114554551564484, + "grad_norm": 1.683669090270996, + "kl": 0.443359375, + "learning_rate": 4.5405378211090004e-07, + "loss": 0.0342, + "reward": 1.1383928954601288, + "reward_std": 0.22951382398605347, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9687500298023224, + "step": 2047 + }, + { + "completion_length": 855.5513916015625, + "epoch": 0.6117541632439698, + "grad_norm": 1.2461812496185303, + "kl": 0.5400390625, + "learning_rate": 4.5359526563471007e-07, + "loss": 0.0326, + "reward": 1.1383928954601288, + "reward_std": 0.28379545360803604, + "rewards/accuracy_reward": 0.18973215483129025, + "rewards/format_reward": 0.9486607611179352, + "step": 2048 + }, + { + "completion_length": 919.9531707763672, + "epoch": 0.6120528713314913, + "grad_norm": 1.6587048768997192, + "kl": 0.474609375, + "learning_rate": 4.531368540374223e-07, + "loss": -0.0295, + "reward": 1.0691964775323868, + "reward_std": 0.2123665250837803, + "rewards/accuracy_reward": 0.12276786426082253, + "rewards/format_reward": 0.9464286118745804, + "step": 2049 + }, + { + "completion_length": 883.3236999511719, + "epoch": 0.6123515794190127, + "grad_norm": 1.010706901550293, + "kl": 0.548828125, + "learning_rate": 4.5267854781774384e-07, + "loss": -0.0034, + "reward": 1.0803572088479996, + "reward_std": 0.18427322059869766, + "rewards/accuracy_reward": 0.12723215040750802, + "rewards/format_reward": 0.9531250298023224, + "step": 2050 + }, + { + "completion_length": 855.2120819091797, + "epoch": 0.6126502875065343, + "grad_norm": 1.0730204582214355, + "kl": 0.6162109375, + "learning_rate": 4.5222034747426687e-07, + "loss": 0.0279, + "reward": 1.194196492433548, + "reward_std": 0.2922382652759552, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9553571939468384, + "step": 2051 + }, + { + "completion_length": 892.1205902099609, + "epoch": 0.6129489955940557, + "grad_norm": 1.9788877964019775, + "kl": 0.5390625, + "learning_rate": 4.517622535054684e-07, + "loss": 0.0268, + "reward": 1.1272321939468384, + "reward_std": 0.2253582775592804, + "rewards/accuracy_reward": 0.1741071455180645, + "rewards/format_reward": 0.9531250298023224, + "step": 2052 + }, + { + "completion_length": 848.3839569091797, + "epoch": 0.6132477036815772, + "grad_norm": 1.5966171026229858, + "kl": 0.4921875, + "learning_rate": 4.5130426640970967e-07, + "loss": 0.0298, + "reward": 1.1339286267757416, + "reward_std": 0.24512644112110138, + "rewards/accuracy_reward": 0.16741072107106447, + "rewards/format_reward": 0.9665178954601288, + "step": 2053 + }, + { + "completion_length": 799.5469055175781, + "epoch": 0.6135464117690986, + "grad_norm": 2.2718772888183594, + "kl": 0.63232421875, + "learning_rate": 4.508463866852358e-07, + "loss": 0.0094, + "reward": 1.0803572088479996, + "reward_std": 0.21034693345427513, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.9464286118745804, + "step": 2054 + }, + { + "completion_length": 932.1853179931641, + "epoch": 0.6138451198566202, + "grad_norm": 1.4305119514465332, + "kl": 0.6298828125, + "learning_rate": 4.503886148301753e-07, + "loss": 0.0494, + "reward": 1.0446429252624512, + "reward_std": 0.22274624928832054, + "rewards/accuracy_reward": 0.08928571688011289, + "rewards/format_reward": 0.9553571790456772, + "step": 2055 + }, + { + "completion_length": 912.6138916015625, + "epoch": 0.6141438279441416, + "grad_norm": 1.4946856498718262, + "kl": 0.677734375, + "learning_rate": 4.4993095134253876e-07, + "loss": 0.0217, + "reward": 1.0937500298023224, + "reward_std": 0.27163976430892944, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 2056 + }, + { + "completion_length": 836.8437805175781, + "epoch": 0.6144425360316631, + "grad_norm": 1.2769663333892822, + "kl": 0.818359375, + "learning_rate": 4.4947339672021946e-07, + "loss": 0.0258, + "reward": 1.0424107611179352, + "reward_std": 0.25391751155257225, + "rewards/accuracy_reward": 0.11160715110599995, + "rewards/format_reward": 0.9308036118745804, + "step": 2057 + }, + { + "completion_length": 802.0625457763672, + "epoch": 0.6147412441191845, + "grad_norm": 1.9323652982711792, + "kl": 0.85546875, + "learning_rate": 4.490159514609918e-07, + "loss": 0.0427, + "reward": 1.178571492433548, + "reward_std": 0.297405444085598, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.948660746216774, + "step": 2058 + }, + { + "completion_length": 799.6674346923828, + "epoch": 0.615039952206706, + "grad_norm": 1.288392186164856, + "kl": 0.9267578125, + "learning_rate": 4.4855861606251156e-07, + "loss": 0.0445, + "reward": 1.1830357611179352, + "reward_std": 0.3391651287674904, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.9375000298023224, + "step": 2059 + }, + { + "completion_length": 803.732177734375, + "epoch": 0.6153386602942275, + "grad_norm": 2.4470372200012207, + "kl": 1.025390625, + "learning_rate": 4.4810139102231446e-07, + "loss": 0.0249, + "reward": 1.0513393133878708, + "reward_std": 0.24923541396856308, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9486607760190964, + "step": 2060 + }, + { + "completion_length": 885.1518249511719, + "epoch": 0.615637368381749, + "grad_norm": 3.2660880088806152, + "kl": 1.013671875, + "learning_rate": 4.47644276837817e-07, + "loss": 0.0303, + "reward": 1.0357143580913544, + "reward_std": 0.30870264023542404, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.9308036118745804, + "step": 2061 + }, + { + "completion_length": 878.3437805175781, + "epoch": 0.6159360764692704, + "grad_norm": 2.1260130405426025, + "kl": 0.830078125, + "learning_rate": 4.471872740063144e-07, + "loss": 0.0517, + "reward": 1.18526791036129, + "reward_std": 0.2754313126206398, + "rewards/accuracy_reward": 0.2321428619325161, + "rewards/format_reward": 0.9531250447034836, + "step": 2062 + }, + { + "completion_length": 866.6875457763672, + "epoch": 0.6162347845567919, + "grad_norm": 1.5321494340896606, + "kl": 0.77587890625, + "learning_rate": 4.467303830249808e-07, + "loss": 0.0174, + "reward": 1.1428572237491608, + "reward_std": 0.22011126950383186, + "rewards/accuracy_reward": 0.18973215017467737, + "rewards/format_reward": 0.9531250447034836, + "step": 2063 + }, + { + "completion_length": 791.3058319091797, + "epoch": 0.6165334926443133, + "grad_norm": 1.627376675605774, + "kl": 0.64306640625, + "learning_rate": 4.4627360439086905e-07, + "loss": 0.0187, + "reward": 1.0625000596046448, + "reward_std": 0.2963542528450489, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9308036118745804, + "step": 2064 + }, + { + "completion_length": 860.8192443847656, + "epoch": 0.6168322007318349, + "grad_norm": 1.6105902194976807, + "kl": 0.6044921875, + "learning_rate": 4.4581693860090954e-07, + "loss": 0.0183, + "reward": 1.2299107909202576, + "reward_std": 0.3075154311954975, + "rewards/accuracy_reward": 0.2767857238650322, + "rewards/format_reward": 0.9531250298023224, + "step": 2065 + }, + { + "completion_length": 934.7902221679688, + "epoch": 0.6171309088193563, + "grad_norm": 2.744340658187866, + "kl": 0.64111328125, + "learning_rate": 4.453603861519096e-07, + "loss": 0.0424, + "reward": 1.020089328289032, + "reward_std": 0.2642363104969263, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.941964328289032, + "step": 2066 + }, + { + "completion_length": 861.2187805175781, + "epoch": 0.6174296169068777, + "grad_norm": 1.5916484594345093, + "kl": 0.783203125, + "learning_rate": 4.449039475405538e-07, + "loss": -0.0195, + "reward": 1.0691964775323868, + "reward_std": 0.3613627329468727, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9285714775323868, + "step": 2067 + }, + { + "completion_length": 918.2678985595703, + "epoch": 0.6177283249943992, + "grad_norm": 2.9413843154907227, + "kl": 0.8515625, + "learning_rate": 4.444476232634027e-07, + "loss": -0.0283, + "reward": 1.07589291036129, + "reward_std": 0.232644684612751, + "rewards/accuracy_reward": 0.13392857951112092, + "rewards/format_reward": 0.941964328289032, + "step": 2068 + }, + { + "completion_length": 777.6138763427734, + "epoch": 0.6180270330819206, + "grad_norm": 1.2273242473602295, + "kl": 0.587890625, + "learning_rate": 4.439914138168922e-07, + "loss": 0.0257, + "reward": 1.1205357909202576, + "reward_std": 0.20790835842490196, + "rewards/accuracy_reward": 0.16517858020961285, + "rewards/format_reward": 0.9553571790456772, + "step": 2069 + }, + { + "completion_length": 819.7031555175781, + "epoch": 0.6183257411694422, + "grad_norm": 1.824122428894043, + "kl": 0.8544921875, + "learning_rate": 4.435353196973334e-07, + "loss": 0.0447, + "reward": 1.1026786267757416, + "reward_std": 0.23486138880252838, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.957589328289032, + "step": 2070 + }, + { + "completion_length": 875.1429138183594, + "epoch": 0.6186244492569636, + "grad_norm": 1.5753593444824219, + "kl": 0.64453125, + "learning_rate": 4.430793414009122e-07, + "loss": 0.0699, + "reward": 1.1183036118745804, + "reward_std": 0.19007983058691025, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9464285969734192, + "step": 2071 + }, + { + "completion_length": 802.6897735595703, + "epoch": 0.6189231573444851, + "grad_norm": 1.4468499422073364, + "kl": 0.68212890625, + "learning_rate": 4.4262347942368815e-07, + "loss": 0.0257, + "reward": 1.0580357909202576, + "reward_std": 0.1679418534040451, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.9642857611179352, + "step": 2072 + }, + { + "completion_length": 863.2433319091797, + "epoch": 0.6192218654320065, + "grad_norm": 2.1965537071228027, + "kl": 0.9072265625, + "learning_rate": 4.4216773426159446e-07, + "loss": 0.0046, + "reward": 1.0781250298023224, + "reward_std": 0.28998734056949615, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9508928954601288, + "step": 2073 + }, + { + "completion_length": 884.622802734375, + "epoch": 0.619520573519528, + "grad_norm": 2.7520952224731445, + "kl": 0.8671875, + "learning_rate": 4.417121064104372e-07, + "loss": 0.0198, + "reward": 1.053571492433548, + "reward_std": 0.2682507000863552, + "rewards/accuracy_reward": 0.11383928824216127, + "rewards/format_reward": 0.9397321939468384, + "step": 2074 + }, + { + "completion_length": 798.9464569091797, + "epoch": 0.6198192816070495, + "grad_norm": 1.7615305185317993, + "kl": 0.759765625, + "learning_rate": 4.4125659636589484e-07, + "loss": 0.0283, + "reward": 1.095982164144516, + "reward_std": 0.27966804802417755, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9397321790456772, + "step": 2075 + }, + { + "completion_length": 810.6495819091797, + "epoch": 0.620117989694571, + "grad_norm": 1.2898403406143188, + "kl": 0.59033203125, + "learning_rate": 4.408012046235177e-07, + "loss": 0.04, + "reward": 1.2477678954601288, + "reward_std": 0.29645831137895584, + "rewards/accuracy_reward": 0.3102678619325161, + "rewards/format_reward": 0.9375000298023224, + "step": 2076 + }, + { + "completion_length": 855.8348541259766, + "epoch": 0.6204166977820924, + "grad_norm": 1.418571949005127, + "kl": 0.7841796875, + "learning_rate": 4.4034593167872714e-07, + "loss": 0.0386, + "reward": 1.1651786267757416, + "reward_std": 0.25783319398760796, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9553571790456772, + "step": 2077 + }, + { + "completion_length": 775.0335235595703, + "epoch": 0.6207154058696139, + "grad_norm": 1.9714293479919434, + "kl": 0.736328125, + "learning_rate": 4.3989077802681576e-07, + "loss": 0.0024, + "reward": 1.1026786267757416, + "reward_std": 0.19489339366555214, + "rewards/accuracy_reward": 0.13839286006987095, + "rewards/format_reward": 0.964285746216774, + "step": 2078 + }, + { + "completion_length": 944.3817291259766, + "epoch": 0.6210141139571353, + "grad_norm": 1.2888500690460205, + "kl": 0.6455078125, + "learning_rate": 4.3943574416294605e-07, + "loss": 0.0304, + "reward": 1.0758929252624512, + "reward_std": 0.2372187003493309, + "rewards/accuracy_reward": 0.11160714365541935, + "rewards/format_reward": 0.964285746216774, + "step": 2079 + }, + { + "completion_length": 874.4777221679688, + "epoch": 0.6213128220446569, + "grad_norm": 1.131554126739502, + "kl": 0.7080078125, + "learning_rate": 4.389808305821502e-07, + "loss": 0.0264, + "reward": 1.1584821939468384, + "reward_std": 0.3358299732208252, + "rewards/accuracy_reward": 0.2276785783469677, + "rewards/format_reward": 0.9308036118745804, + "step": 2080 + }, + { + "completion_length": 959.96435546875, + "epoch": 0.6216115301321783, + "grad_norm": 1.427390456199646, + "kl": 0.662109375, + "learning_rate": 4.385260377793295e-07, + "loss": 0.0446, + "reward": 1.0781250447034836, + "reward_std": 0.2839691527187824, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.9598214626312256, + "step": 2081 + }, + { + "completion_length": 910.7031555175781, + "epoch": 0.6219102382196998, + "grad_norm": 1.1579957008361816, + "kl": 0.564453125, + "learning_rate": 4.380713662492541e-07, + "loss": 0.036, + "reward": 1.1941964626312256, + "reward_std": 0.221238873898983, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9620536118745804, + "step": 2082 + }, + { + "completion_length": 904.2946929931641, + "epoch": 0.6222089463072212, + "grad_norm": 0.8928738236427307, + "kl": 0.58935546875, + "learning_rate": 4.376168164865622e-07, + "loss": 0.0218, + "reward": 1.1004464775323868, + "reward_std": 0.23688321560621262, + "rewards/accuracy_reward": 0.14285715040750802, + "rewards/format_reward": 0.957589328289032, + "step": 2083 + }, + { + "completion_length": 819.7857513427734, + "epoch": 0.6225076543947428, + "grad_norm": 1.945828914642334, + "kl": 0.5419921875, + "learning_rate": 4.3716238898575906e-07, + "loss": 0.0437, + "reward": 1.1428571790456772, + "reward_std": 0.224761750549078, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9553571790456772, + "step": 2084 + }, + { + "completion_length": 824.372802734375, + "epoch": 0.6228063624822642, + "grad_norm": 0.8767666220664978, + "kl": 0.48779296875, + "learning_rate": 4.3670808424121765e-07, + "loss": -0.0058, + "reward": 1.1674107909202576, + "reward_std": 0.2454911284148693, + "rewards/accuracy_reward": 0.2075893022119999, + "rewards/format_reward": 0.9598214626312256, + "step": 2085 + }, + { + "completion_length": 925.2723846435547, + "epoch": 0.6231050705697857, + "grad_norm": 1.534939169883728, + "kl": 0.60009765625, + "learning_rate": 4.362539027471767e-07, + "loss": 0.014, + "reward": 1.0937500149011612, + "reward_std": 0.3001435399055481, + "rewards/accuracy_reward": 0.14062500232830644, + "rewards/format_reward": 0.9531250298023224, + "step": 2086 + }, + { + "completion_length": 852.0223541259766, + "epoch": 0.6234037786573071, + "grad_norm": 0.9260286688804626, + "kl": 0.6826171875, + "learning_rate": 4.357998449977411e-07, + "loss": 0.0333, + "reward": 1.1718750596046448, + "reward_std": 0.3087809905409813, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.9486607611179352, + "step": 2087 + }, + { + "completion_length": 842.3214721679688, + "epoch": 0.6237024867448286, + "grad_norm": 1.2230266332626343, + "kl": 0.50048828125, + "learning_rate": 4.353459114868814e-07, + "loss": 0.0012, + "reward": 1.1250000596046448, + "reward_std": 0.24925906583666801, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.9531250447034836, + "step": 2088 + }, + { + "completion_length": 895.9420318603516, + "epoch": 0.6240011948323501, + "grad_norm": 2.8408899307250977, + "kl": 0.49658203125, + "learning_rate": 4.348921027084327e-07, + "loss": 0.0321, + "reward": 1.1205357313156128, + "reward_std": 0.2589542418718338, + "rewards/accuracy_reward": 0.1629464291036129, + "rewards/format_reward": 0.9575893133878708, + "step": 2089 + }, + { + "completion_length": 914.4866485595703, + "epoch": 0.6242999029198716, + "grad_norm": 1.6270480155944824, + "kl": 0.58837890625, + "learning_rate": 4.3443841915609457e-07, + "loss": 0.0245, + "reward": 1.0758929252624512, + "reward_std": 0.19031718373298645, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.96651791036129, + "step": 2090 + }, + { + "completion_length": 891.6272583007812, + "epoch": 0.624598611007393, + "grad_norm": 1.0296138525009155, + "kl": 0.52392578125, + "learning_rate": 4.339848613234299e-07, + "loss": -0.0257, + "reward": 1.0647322088479996, + "reward_std": 0.21209541335701942, + "rewards/accuracy_reward": 0.10714286402799189, + "rewards/format_reward": 0.957589328289032, + "step": 2091 + }, + { + "completion_length": 879.8928985595703, + "epoch": 0.6248973190949145, + "grad_norm": 0.9688863754272461, + "kl": 0.43603515625, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.04, + "reward": 1.1718750298023224, + "reward_std": 0.2794105224311352, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.9665178954601288, + "step": 2092 + }, + { + "completion_length": 882.6652221679688, + "epoch": 0.6251960271824359, + "grad_norm": 2.2248692512512207, + "kl": 0.4658203125, + "learning_rate": 4.3307812479069063e-07, + "loss": 0.039, + "reward": 1.178571492433548, + "reward_std": 0.26739315688610077, + "rewards/accuracy_reward": 0.21651786752045155, + "rewards/format_reward": 0.9620536118745804, + "step": 2093 + }, + { + "completion_length": 909.8214874267578, + "epoch": 0.6254947352699575, + "grad_norm": 1.4111870527267456, + "kl": 0.60302734375, + "learning_rate": 4.326249470770563e-07, + "loss": 0.0177, + "reward": 1.145089328289032, + "reward_std": 0.2752758488059044, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9553571939468384, + "step": 2094 + }, + { + "completion_length": 839.450927734375, + "epoch": 0.6257934433574789, + "grad_norm": 2.4745185375213623, + "kl": 0.37060546875, + "learning_rate": 4.3217189705597547e-07, + "loss": 0.0592, + "reward": 1.0848214775323868, + "reward_std": 0.2250109426677227, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.973214328289032, + "step": 2095 + }, + { + "completion_length": 955.3661193847656, + "epoch": 0.6260921514450004, + "grad_norm": 1.3813669681549072, + "kl": 0.5966796875, + "learning_rate": 4.317189752203224e-07, + "loss": 0.043, + "reward": 1.0781250298023224, + "reward_std": 0.2690932974219322, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9464286118745804, + "step": 2096 + }, + { + "completion_length": 922.5111999511719, + "epoch": 0.6263908595325218, + "grad_norm": 1.0477544069290161, + "kl": 0.48779296875, + "learning_rate": 4.3126618206283136e-07, + "loss": 0.0155, + "reward": 1.2276786267757416, + "reward_std": 0.2137899175286293, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.9709821790456772, + "step": 2097 + }, + { + "completion_length": 995.2098541259766, + "epoch": 0.6266895676200434, + "grad_norm": 0.7777249217033386, + "kl": 0.51025390625, + "learning_rate": 4.308135180760971e-07, + "loss": 0.0045, + "reward": 1.0758928954601288, + "reward_std": 0.2730249911546707, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.9553571790456772, + "step": 2098 + }, + { + "completion_length": 856.232177734375, + "epoch": 0.6269882757075648, + "grad_norm": 1.3243398666381836, + "kl": 0.6953125, + "learning_rate": 4.303609837525737e-07, + "loss": 0.0323, + "reward": 1.1272321790456772, + "reward_std": 0.2228815294802189, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.9687500298023224, + "step": 2099 + }, + { + "completion_length": 913.5893249511719, + "epoch": 0.6272869837950863, + "grad_norm": 1.2732727527618408, + "kl": 0.5654296875, + "learning_rate": 4.2990857958457407e-07, + "loss": 0.029, + "reward": 1.1964286416769028, + "reward_std": 0.24618449993431568, + "rewards/accuracy_reward": 0.2276785906869918, + "rewards/format_reward": 0.9687500298023224, + "step": 2100 + }, + { + "completion_length": 955.8750457763672, + "epoch": 0.6275856918826077, + "grad_norm": 1.369410514831543, + "kl": 0.67919921875, + "learning_rate": 4.2945630606426966e-07, + "loss": -0.014, + "reward": 1.0892857909202576, + "reward_std": 0.29397959262132645, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.9464286118745804, + "step": 2101 + }, + { + "completion_length": 906.3393249511719, + "epoch": 0.6278843999701292, + "grad_norm": 1.2346742153167725, + "kl": 0.71728515625, + "learning_rate": 4.2900416368368963e-07, + "loss": 0.0362, + "reward": 1.1607143580913544, + "reward_std": 0.2721154596656561, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.948660746216774, + "step": 2102 + }, + { + "completion_length": 794.6830749511719, + "epoch": 0.6281831080576507, + "grad_norm": 1.7515910863876343, + "kl": 0.6494140625, + "learning_rate": 4.285521529347207e-07, + "loss": 0.0294, + "reward": 1.1517857611179352, + "reward_std": 0.3037766329944134, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9531250447034836, + "step": 2103 + }, + { + "completion_length": 875.7902374267578, + "epoch": 0.6284818161451722, + "grad_norm": 1.0753915309906006, + "kl": 0.58203125, + "learning_rate": 4.281002743091062e-07, + "loss": 0.008, + "reward": 1.1250000596046448, + "reward_std": 0.23395810276269913, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.9553571939468384, + "step": 2104 + }, + { + "completion_length": 877.1808319091797, + "epoch": 0.6287805242326936, + "grad_norm": 0.8828577399253845, + "kl": 0.4677734375, + "learning_rate": 4.2764852829844566e-07, + "loss": 0.0398, + "reward": 1.1964285969734192, + "reward_std": 0.19142673537135124, + "rewards/accuracy_reward": 0.2254464328289032, + "rewards/format_reward": 0.9709821939468384, + "step": 2105 + }, + { + "completion_length": 905.1540374755859, + "epoch": 0.6290792323202151, + "grad_norm": 1.7710858583450317, + "kl": 0.5869140625, + "learning_rate": 4.271969153941948e-07, + "loss": 0.0114, + "reward": 1.1741072237491608, + "reward_std": 0.2078237682580948, + "rewards/accuracy_reward": 0.2053571566939354, + "rewards/format_reward": 0.9687500447034836, + "step": 2106 + }, + { + "completion_length": 802.1719360351562, + "epoch": 0.6293779404077365, + "grad_norm": 1.2643945217132568, + "kl": 0.8798828125, + "learning_rate": 4.267454360876639e-07, + "loss": 0.0432, + "reward": 1.1897321939468384, + "reward_std": 0.3443286269903183, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.9397321939468384, + "step": 2107 + }, + { + "completion_length": 875.4710235595703, + "epoch": 0.6296766484952581, + "grad_norm": 2.457350730895996, + "kl": 0.8759765625, + "learning_rate": 4.2629409087001835e-07, + "loss": 0.0236, + "reward": 1.1562500596046448, + "reward_std": 0.2827811501920223, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.957589328289032, + "step": 2108 + }, + { + "completion_length": 783.747802734375, + "epoch": 0.6299753565827795, + "grad_norm": 2.7752482891082764, + "kl": 0.822265625, + "learning_rate": 4.258428802322773e-07, + "loss": 0.0501, + "reward": 1.100446492433548, + "reward_std": 0.24782096967101097, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9486607611179352, + "step": 2109 + }, + { + "completion_length": 926.1674499511719, + "epoch": 0.6302740646703009, + "grad_norm": 1.1657272577285767, + "kl": 0.791015625, + "learning_rate": 4.25391804665314e-07, + "loss": 0.0078, + "reward": 0.99776791036129, + "reward_std": 0.21148014068603516, + "rewards/accuracy_reward": 0.04017857275903225, + "rewards/format_reward": 0.9575893133878708, + "step": 2110 + }, + { + "completion_length": 795.4866333007812, + "epoch": 0.6305727727578224, + "grad_norm": 3.605747938156128, + "kl": 0.9931640625, + "learning_rate": 4.2494086465985434e-07, + "loss": 0.0063, + "reward": 1.129464328289032, + "reward_std": 0.2430562861263752, + "rewards/accuracy_reward": 0.17410715483129025, + "rewards/format_reward": 0.9553571790456772, + "step": 2111 + }, + { + "completion_length": 830.0156555175781, + "epoch": 0.6308714808453438, + "grad_norm": 15.181953430175781, + "kl": 1.6181640625, + "learning_rate": 4.2449006070647663e-07, + "loss": 0.0717, + "reward": 1.0491071790456772, + "reward_std": 0.24809403344988823, + "rewards/accuracy_reward": 0.10937500791624188, + "rewards/format_reward": 0.9397321939468384, + "step": 2112 + }, + { + "completion_length": 873.6607513427734, + "epoch": 0.6311701889328654, + "grad_norm": 2.053746461868286, + "kl": 0.716796875, + "learning_rate": 4.240393932956117e-07, + "loss": 0.0172, + "reward": 1.147321492433548, + "reward_std": 0.26845424249768257, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9441964626312256, + "step": 2113 + }, + { + "completion_length": 887.279052734375, + "epoch": 0.6314688970203868, + "grad_norm": 1.399213194847107, + "kl": 1.0341796875, + "learning_rate": 4.2358886291754134e-07, + "loss": 0.0362, + "reward": 1.1495536267757416, + "reward_std": 0.25369637086987495, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9553571790456772, + "step": 2114 + }, + { + "completion_length": 888.7902069091797, + "epoch": 0.6317676051079083, + "grad_norm": 6.160454750061035, + "kl": 0.884765625, + "learning_rate": 4.2313847006239867e-07, + "loss": 0.0173, + "reward": 1.209821492433548, + "reward_std": 0.29108263924717903, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.9553571790456772, + "step": 2115 + }, + { + "completion_length": 820.1451263427734, + "epoch": 0.6320663131954297, + "grad_norm": 1.267495036125183, + "kl": 1.0068359375, + "learning_rate": 4.2268821522016665e-07, + "loss": 0.0433, + "reward": 1.2008928954601288, + "reward_std": 0.36863408982753754, + "rewards/accuracy_reward": 0.267857164144516, + "rewards/format_reward": 0.9330357611179352, + "step": 2116 + }, + { + "completion_length": 902.7344207763672, + "epoch": 0.6323650212829512, + "grad_norm": 1.655463457107544, + "kl": 0.8212890625, + "learning_rate": 4.222380988806786e-07, + "loss": -0.0061, + "reward": 1.0691964626312256, + "reward_std": 0.3530699834227562, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9084821939468384, + "step": 2117 + }, + { + "completion_length": 1021.4978485107422, + "epoch": 0.6326637293704727, + "grad_norm": 1.1450549364089966, + "kl": 0.5634765625, + "learning_rate": 4.2178812153361697e-07, + "loss": -0.0019, + "reward": 1.035714328289032, + "reward_std": 0.26826875284314156, + "rewards/accuracy_reward": 0.08482143189758062, + "rewards/format_reward": 0.9508928954601288, + "step": 2118 + }, + { + "completion_length": 931.3728179931641, + "epoch": 0.6329624374579942, + "grad_norm": 0.9389306306838989, + "kl": 0.6650390625, + "learning_rate": 4.21338283668513e-07, + "loss": 0.0154, + "reward": 1.098214328289032, + "reward_std": 0.26300113648176193, + "rewards/accuracy_reward": 0.13839286006987095, + "rewards/format_reward": 0.9598214626312256, + "step": 2119 + }, + { + "completion_length": 901.9710235595703, + "epoch": 0.6332611455455156, + "grad_norm": 1.3152470588684082, + "kl": 0.57275390625, + "learning_rate": 4.2088858577474616e-07, + "loss": 0.016, + "reward": 1.0379464477300644, + "reward_std": 0.22846092656254768, + "rewards/accuracy_reward": 0.08705357415601611, + "rewards/format_reward": 0.9508928954601288, + "step": 2120 + }, + { + "completion_length": 897.3058471679688, + "epoch": 0.6335598536330371, + "grad_norm": 1.3011442422866821, + "kl": 0.8349609375, + "learning_rate": 4.2043902834154374e-07, + "loss": 0.0271, + "reward": 1.0714286267757416, + "reward_std": 0.33677710592746735, + "rewards/accuracy_reward": 0.14732143888249993, + "rewards/format_reward": 0.9241071790456772, + "step": 2121 + }, + { + "completion_length": 942.3750457763672, + "epoch": 0.6338585617205585, + "grad_norm": 1.472259521484375, + "kl": 0.783203125, + "learning_rate": 4.199896118579802e-07, + "loss": 0.0202, + "reward": 1.1026785969734192, + "reward_std": 0.30853695422410965, + "rewards/accuracy_reward": 0.16741071827709675, + "rewards/format_reward": 0.93526791036129, + "step": 2122 + }, + { + "completion_length": 820.2857513427734, + "epoch": 0.6341572698080801, + "grad_norm": 1.6542870998382568, + "kl": 0.6845703125, + "learning_rate": 4.195403368129764e-07, + "loss": 0.0586, + "reward": 1.196428656578064, + "reward_std": 0.2899467870593071, + "rewards/accuracy_reward": 0.2455357201397419, + "rewards/format_reward": 0.9508928954601288, + "step": 2123 + }, + { + "completion_length": 835.4263916015625, + "epoch": 0.6344559778956015, + "grad_norm": 1.2151845693588257, + "kl": 0.58251953125, + "learning_rate": 4.190912036952999e-07, + "loss": 0.0544, + "reward": 1.1316964626312256, + "reward_std": 0.19009454920887947, + "rewards/accuracy_reward": 0.16964286379516125, + "rewards/format_reward": 0.9620536118745804, + "step": 2124 + }, + { + "completion_length": 919.6920013427734, + "epoch": 0.634754685983123, + "grad_norm": 0.9524555802345276, + "kl": 0.677734375, + "learning_rate": 4.1864221299356337e-07, + "loss": 0.0201, + "reward": 1.100446492433548, + "reward_std": 0.21709850057959557, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.9508928954601288, + "step": 2125 + }, + { + "completion_length": 923.0937957763672, + "epoch": 0.6350533940706444, + "grad_norm": 1.100537657737732, + "kl": 0.60986328125, + "learning_rate": 4.181933651962245e-07, + "loss": 0.0532, + "reward": 1.100446492433548, + "reward_std": 0.2583924029022455, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.9486607611179352, + "step": 2126 + }, + { + "completion_length": 954.4866485595703, + "epoch": 0.635352102158166, + "grad_norm": 1.2507281303405762, + "kl": 0.73779296875, + "learning_rate": 4.177446607915859e-07, + "loss": 0.0329, + "reward": 1.1361607611179352, + "reward_std": 0.30600791424512863, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9375000298023224, + "step": 2127 + }, + { + "completion_length": 996.6138763427734, + "epoch": 0.6356508102456874, + "grad_norm": 0.7661289572715759, + "kl": 0.607421875, + "learning_rate": 4.1729610026779407e-07, + "loss": 0.0109, + "reward": 1.207589328289032, + "reward_std": 0.22884732484817505, + "rewards/accuracy_reward": 0.2500000223517418, + "rewards/format_reward": 0.9575893133878708, + "step": 2128 + }, + { + "completion_length": 902.6094207763672, + "epoch": 0.6359495183332089, + "grad_norm": 1.4945420026779175, + "kl": 0.88916015625, + "learning_rate": 4.1684768411283865e-07, + "loss": 0.03, + "reward": 1.0535714775323868, + "reward_std": 0.2752615250647068, + "rewards/accuracy_reward": 0.11830357508733869, + "rewards/format_reward": 0.93526791036129, + "step": 2129 + }, + { + "completion_length": 937.6986846923828, + "epoch": 0.6362482264207303, + "grad_norm": 1.1132690906524658, + "kl": 0.50341796875, + "learning_rate": 4.163994128145526e-07, + "loss": 0.003, + "reward": 1.1205357611179352, + "reward_std": 0.29570169001817703, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.941964328289032, + "step": 2130 + }, + { + "completion_length": 869.372802734375, + "epoch": 0.6365469345082518, + "grad_norm": 0.8771235942840576, + "kl": 0.5634765625, + "learning_rate": 4.159512868606112e-07, + "loss": 0.0153, + "reward": 1.1406250298023224, + "reward_std": 0.3002980127930641, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9419643133878708, + "step": 2131 + }, + { + "completion_length": 879.5111846923828, + "epoch": 0.6368456425957733, + "grad_norm": 0.7739313840866089, + "kl": 0.6015625, + "learning_rate": 4.155033067385314e-07, + "loss": -0.0122, + "reward": 1.1741071939468384, + "reward_std": 0.2989964000880718, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9531250298023224, + "step": 2132 + }, + { + "completion_length": 817.5960235595703, + "epoch": 0.6371443506832948, + "grad_norm": 1.0549553632736206, + "kl": 0.521484375, + "learning_rate": 4.1505547293567177e-07, + "loss": -0.0005, + "reward": 1.225446492433548, + "reward_std": 0.29427750408649445, + "rewards/accuracy_reward": 0.2812500111758709, + "rewards/format_reward": 0.9441964626312256, + "step": 2133 + }, + { + "completion_length": 912.5960083007812, + "epoch": 0.6374430587708162, + "grad_norm": 1.4844204187393188, + "kl": 0.53125, + "learning_rate": 4.1460778593923173e-07, + "loss": 0.036, + "reward": 1.1741071939468384, + "reward_std": 0.2637329436838627, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9375000447034836, + "step": 2134 + }, + { + "completion_length": 865.6027069091797, + "epoch": 0.6377417668583377, + "grad_norm": 2.087496757507324, + "kl": 0.62841796875, + "learning_rate": 4.1416024623625066e-07, + "loss": 0.0496, + "reward": 1.1540178954601288, + "reward_std": 0.33001749962568283, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9330357760190964, + "step": 2135 + }, + { + "completion_length": 866.5045013427734, + "epoch": 0.6380404749458591, + "grad_norm": 0.8182637095451355, + "kl": 0.3984375, + "learning_rate": 4.13712854313608e-07, + "loss": -0.0189, + "reward": 1.1718750447034836, + "reward_std": 0.22870353236794472, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.96651791036129, + "step": 2136 + }, + { + "completion_length": 891.9687805175781, + "epoch": 0.6383391830333807, + "grad_norm": 1.7227811813354492, + "kl": 0.450927734375, + "learning_rate": 4.132656106580221e-07, + "loss": 0.0218, + "reward": 1.1629465222358704, + "reward_std": 0.29297712072730064, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.9464286118745804, + "step": 2137 + }, + { + "completion_length": 774.5937805175781, + "epoch": 0.6386378911209021, + "grad_norm": 1.1056374311447144, + "kl": 0.43310546875, + "learning_rate": 4.128185157560506e-07, + "loss": 0.0251, + "reward": 1.2991072237491608, + "reward_std": 0.28117430210113525, + "rewards/accuracy_reward": 0.3370535857975483, + "rewards/format_reward": 0.9620536267757416, + "step": 2138 + }, + { + "completion_length": 834.0870819091797, + "epoch": 0.6389365992084236, + "grad_norm": 1.548931360244751, + "kl": 0.5, + "learning_rate": 4.1237157009408864e-07, + "loss": -0.0096, + "reward": 1.0959821790456772, + "reward_std": 0.20086470246315002, + "rewards/accuracy_reward": 0.13392857275903225, + "rewards/format_reward": 0.9620536118745804, + "step": 2139 + }, + { + "completion_length": 879.9308471679688, + "epoch": 0.639235307295945, + "grad_norm": 1.5186760425567627, + "kl": 0.48583984375, + "learning_rate": 4.1192477415836944e-07, + "loss": 0.0368, + "reward": 1.1741071939468384, + "reward_std": 0.2941064015030861, + "rewards/accuracy_reward": 0.2187500149011612, + "rewards/format_reward": 0.9553571790456772, + "step": 2140 + }, + { + "completion_length": 868.8192443847656, + "epoch": 0.6395340153834665, + "grad_norm": 2.116560935974121, + "kl": 0.56640625, + "learning_rate": 4.114781284349631e-07, + "loss": 0.0424, + "reward": 1.1160714626312256, + "reward_std": 0.2295653335750103, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9464286118745804, + "step": 2141 + }, + { + "completion_length": 963.5893402099609, + "epoch": 0.639832723470988, + "grad_norm": 1.663643717765808, + "kl": 0.6630859375, + "learning_rate": 4.110316334097764e-07, + "loss": 0.0294, + "reward": 1.1674107611179352, + "reward_std": 0.2882666476070881, + "rewards/accuracy_reward": 0.23883929941803217, + "rewards/format_reward": 0.9285714775323868, + "step": 2142 + }, + { + "completion_length": 943.3348693847656, + "epoch": 0.6401314315585095, + "grad_norm": 1.2628151178359985, + "kl": 0.40673828125, + "learning_rate": 4.105852895685522e-07, + "loss": 0.0373, + "reward": 1.0178571939468384, + "reward_std": 0.18234828114509583, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.9598214626312256, + "step": 2143 + }, + { + "completion_length": 815.4844207763672, + "epoch": 0.6404301396460309, + "grad_norm": 2.172218084335327, + "kl": 0.396484375, + "learning_rate": 4.101390973968688e-07, + "loss": 0.0374, + "reward": 1.066964328289032, + "reward_std": 0.25100894272327423, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9375000447034836, + "step": 2144 + }, + { + "completion_length": 931.4754943847656, + "epoch": 0.6407288477335524, + "grad_norm": 1.3899046182632446, + "kl": 0.619140625, + "learning_rate": 4.096930573801396e-07, + "loss": 0.0076, + "reward": 0.986607164144516, + "reward_std": 0.27914033830165863, + "rewards/accuracy_reward": 0.06250000279396772, + "rewards/format_reward": 0.9241071790456772, + "step": 2145 + }, + { + "completion_length": 841.013427734375, + "epoch": 0.6410275558210738, + "grad_norm": 1.01020085811615, + "kl": 0.5791015625, + "learning_rate": 4.0924717000361243e-07, + "loss": 0.005, + "reward": 1.1093750596046448, + "reward_std": 0.26690591499209404, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.9531250447034836, + "step": 2146 + }, + { + "completion_length": 845.4598693847656, + "epoch": 0.6413262639085954, + "grad_norm": 1.2705832719802856, + "kl": 0.68310546875, + "learning_rate": 4.0880143575236915e-07, + "loss": -0.0085, + "reward": 1.116071492433548, + "reward_std": 0.2670409046113491, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.9441964775323868, + "step": 2147 + }, + { + "completion_length": 929.8326110839844, + "epoch": 0.6416249719961168, + "grad_norm": 1.5272939205169678, + "kl": 0.603515625, + "learning_rate": 4.083558551113245e-07, + "loss": -0.002, + "reward": 1.0580357611179352, + "reward_std": 0.2237013354897499, + "rewards/accuracy_reward": 0.09598214831203222, + "rewards/format_reward": 0.9620536118745804, + "step": 2148 + }, + { + "completion_length": 911.8772735595703, + "epoch": 0.6419236800836383, + "grad_norm": 1.560722827911377, + "kl": 0.61865234375, + "learning_rate": 4.0791042856522717e-07, + "loss": 0.0198, + "reward": 1.178571492433548, + "reward_std": 0.3163914494216442, + "rewards/accuracy_reward": 0.2299107275903225, + "rewards/format_reward": 0.948660746216774, + "step": 2149 + }, + { + "completion_length": 873.3549499511719, + "epoch": 0.6422223881711597, + "grad_norm": 1.1499122381210327, + "kl": 0.7294921875, + "learning_rate": 4.074651565986572e-07, + "loss": 0.0124, + "reward": 1.1093750298023224, + "reward_std": 0.26264226622879505, + "rewards/accuracy_reward": 0.16517858067527413, + "rewards/format_reward": 0.9441964775323868, + "step": 2150 + }, + { + "completion_length": 849.3750457763672, + "epoch": 0.6425210962586813, + "grad_norm": 1.0029292106628418, + "kl": 0.65869140625, + "learning_rate": 4.070200396960269e-07, + "loss": -0.0202, + "reward": 1.0178571939468384, + "reward_std": 0.2542550601065159, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.9352679252624512, + "step": 2151 + }, + { + "completion_length": 901.5312957763672, + "epoch": 0.6428198043462027, + "grad_norm": 2.607665777206421, + "kl": 0.8330078125, + "learning_rate": 4.0657507834158e-07, + "loss": 0.0158, + "reward": 1.1205357611179352, + "reward_std": 0.2860527075827122, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.9375000447034836, + "step": 2152 + }, + { + "completion_length": 846.9063110351562, + "epoch": 0.6431185124337241, + "grad_norm": 1.2970296144485474, + "kl": 0.7177734375, + "learning_rate": 4.0613027301939063e-07, + "loss": 0.0567, + "reward": 1.089285746216774, + "reward_std": 0.26729127019643784, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.9397321939468384, + "step": 2153 + }, + { + "completion_length": 955.5826416015625, + "epoch": 0.6434172205212456, + "grad_norm": 2.3544018268585205, + "kl": 1.109375, + "learning_rate": 4.056856242133634e-07, + "loss": 0.0506, + "reward": 1.0937500298023224, + "reward_std": 0.29617900028824806, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9263393133878708, + "step": 2154 + }, + { + "completion_length": 868.9911041259766, + "epoch": 0.643715928608767, + "grad_norm": 1.0916680097579956, + "kl": 0.90380859375, + "learning_rate": 4.0524113240723266e-07, + "loss": 0.057, + "reward": 1.1473214626312256, + "reward_std": 0.2743499372154474, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9620536118745804, + "step": 2155 + }, + { + "completion_length": 893.4174499511719, + "epoch": 0.6440146366962886, + "grad_norm": 1.2546813488006592, + "kl": 0.54736328125, + "learning_rate": 4.047967980845621e-07, + "loss": 0.0, + "reward": 1.196428656578064, + "reward_std": 0.2519974000751972, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9575893133878708, + "step": 2156 + }, + { + "completion_length": 895.1563110351562, + "epoch": 0.64431334478381, + "grad_norm": 1.4603043794631958, + "kl": 0.76171875, + "learning_rate": 4.0435262172874376e-07, + "loss": -0.0156, + "reward": 1.0178571790456772, + "reward_std": 0.293575219810009, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.9218750596046448, + "step": 2157 + }, + { + "completion_length": 825.9643402099609, + "epoch": 0.6446120528713315, + "grad_norm": 1.1285815238952637, + "kl": 0.5322265625, + "learning_rate": 4.0390860382299795e-07, + "loss": 0.0356, + "reward": 1.004464328289032, + "reward_std": 0.20257655903697014, + "rewards/accuracy_reward": 0.0669642889406532, + "rewards/format_reward": 0.9375000447034836, + "step": 2158 + }, + { + "completion_length": 832.2678985595703, + "epoch": 0.6449107609588529, + "grad_norm": 1.2395503520965576, + "kl": 0.50634765625, + "learning_rate": 4.0346474485037274e-07, + "loss": 0.0301, + "reward": 1.1830357611179352, + "reward_std": 0.2053123079240322, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/format_reward": 0.9642857611179352, + "step": 2159 + }, + { + "completion_length": 844.3236999511719, + "epoch": 0.6452094690463744, + "grad_norm": 3.6225616931915283, + "kl": 0.7978515625, + "learning_rate": 4.0302104529374314e-07, + "loss": 0.0491, + "reward": 1.1250000596046448, + "reward_std": 0.3014411926269531, + "rewards/accuracy_reward": 0.1808035857975483, + "rewards/format_reward": 0.9441964626312256, + "step": 2160 + }, + { + "completion_length": 877.8013763427734, + "epoch": 0.6455081771338959, + "grad_norm": 1.6510776281356812, + "kl": 0.578125, + "learning_rate": 4.025775056358107e-07, + "loss": 0.0058, + "reward": 1.1294643580913544, + "reward_std": 0.30031709745526314, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9308036267757416, + "step": 2161 + }, + { + "completion_length": 876.4955749511719, + "epoch": 0.6458068852214174, + "grad_norm": 0.7933779358863831, + "kl": 0.4013671875, + "learning_rate": 4.0213412635910316e-07, + "loss": -0.0078, + "reward": 1.0870536267757416, + "reward_std": 0.21324806660413742, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.9598214775323868, + "step": 2162 + }, + { + "completion_length": 889.7924499511719, + "epoch": 0.6461055933089388, + "grad_norm": 1.008821725845337, + "kl": 0.5087890625, + "learning_rate": 4.016909079459738e-07, + "loss": 0.0125, + "reward": 1.082589328289032, + "reward_std": 0.2687847837805748, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9352678954601288, + "step": 2163 + }, + { + "completion_length": 897.0513763427734, + "epoch": 0.6464043013964603, + "grad_norm": 0.7107560634613037, + "kl": 0.3759765625, + "learning_rate": 4.012478508786008e-07, + "loss": -0.0139, + "reward": 1.0714286118745804, + "reward_std": 0.1808621920645237, + "rewards/accuracy_reward": 0.10044643585570157, + "rewards/format_reward": 0.9709821939468384, + "step": 2164 + }, + { + "completion_length": 921.2768096923828, + "epoch": 0.6467030094839817, + "grad_norm": 3.003718137741089, + "kl": 0.54736328125, + "learning_rate": 4.0080495563898664e-07, + "loss": 0.0518, + "reward": 1.0468750298023224, + "reward_std": 0.24494927376508713, + "rewards/accuracy_reward": 0.09821429313160479, + "rewards/format_reward": 0.9486607611179352, + "step": 2165 + }, + { + "completion_length": 924.5000305175781, + "epoch": 0.6470017175715033, + "grad_norm": 1.144073486328125, + "kl": 0.4482421875, + "learning_rate": 4.0036222270895803e-07, + "loss": 0.0088, + "reward": 1.102678656578064, + "reward_std": 0.2837497889995575, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9419643133878708, + "step": 2166 + }, + { + "completion_length": 844.8973693847656, + "epoch": 0.6473004256590247, + "grad_norm": 0.7215293049812317, + "kl": 0.41650390625, + "learning_rate": 3.9991965257016525e-07, + "loss": 0.0217, + "reward": 1.2232143580913544, + "reward_std": 0.2282499484717846, + "rewards/accuracy_reward": 0.2767857313156128, + "rewards/format_reward": 0.9464286118745804, + "step": 2167 + }, + { + "completion_length": 781.935302734375, + "epoch": 0.6475991337465462, + "grad_norm": 1.370062232017517, + "kl": 0.46484375, + "learning_rate": 3.9947724570408093e-07, + "loss": 0.0068, + "reward": 1.1339286267757416, + "reward_std": 0.2710443548858166, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.948660746216774, + "step": 2168 + }, + { + "completion_length": 772.2701263427734, + "epoch": 0.6478978418340676, + "grad_norm": 0.860207200050354, + "kl": 0.353515625, + "learning_rate": 3.990350025920003e-07, + "loss": 0.0308, + "reward": 1.1406250596046448, + "reward_std": 0.20463163778185844, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.973214328289032, + "step": 2169 + }, + { + "completion_length": 860.3482513427734, + "epoch": 0.6481965499215891, + "grad_norm": 1.193199872970581, + "kl": 0.3916015625, + "learning_rate": 3.9859292371504085e-07, + "loss": 0.0153, + "reward": 1.0647322088479996, + "reward_std": 0.20471173711121082, + "rewards/accuracy_reward": 0.10491071874275804, + "rewards/format_reward": 0.9598214775323868, + "step": 2170 + }, + { + "completion_length": 839.5424346923828, + "epoch": 0.6484952580091106, + "grad_norm": 0.6073041558265686, + "kl": 0.406982421875, + "learning_rate": 3.981510095541408e-07, + "loss": 0.0124, + "reward": 1.0959821939468384, + "reward_std": 0.24134360998868942, + "rewards/accuracy_reward": 0.14732143026776612, + "rewards/format_reward": 0.9486607611179352, + "step": 2171 + }, + { + "completion_length": 827.0848541259766, + "epoch": 0.6487939660966321, + "grad_norm": 1.0499686002731323, + "kl": 0.3818359375, + "learning_rate": 3.977092605900596e-07, + "loss": -0.0014, + "reward": 1.0870536267757416, + "reward_std": 0.23749108612537384, + "rewards/accuracy_reward": 0.13169643329456449, + "rewards/format_reward": 0.9553571939468384, + "step": 2172 + }, + { + "completion_length": 848.6161041259766, + "epoch": 0.6490926741841535, + "grad_norm": 1.0562007427215576, + "kl": 0.4921875, + "learning_rate": 3.9726767730337687e-07, + "loss": -0.0125, + "reward": 1.1071428954601288, + "reward_std": 0.3218330815434456, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.9441964775323868, + "step": 2173 + }, + { + "completion_length": 919.1964721679688, + "epoch": 0.649391382271675, + "grad_norm": 0.715873658657074, + "kl": 0.35693359375, + "learning_rate": 3.968262601744917e-07, + "loss": 0.0109, + "reward": 1.01339291036129, + "reward_std": 0.2489210031926632, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.9308036118745804, + "step": 2174 + }, + { + "completion_length": 866.5781860351562, + "epoch": 0.6496900903591964, + "grad_norm": 0.8334385752677917, + "kl": 0.29248046875, + "learning_rate": 3.96385009683623e-07, + "loss": 0.0176, + "reward": 1.238839328289032, + "reward_std": 0.3185366988182068, + "rewards/accuracy_reward": 0.290178582072258, + "rewards/format_reward": 0.9486607611179352, + "step": 2175 + }, + { + "completion_length": 902.7634124755859, + "epoch": 0.649988798446718, + "grad_norm": 0.5196169018745422, + "kl": 0.306884765625, + "learning_rate": 3.9594392631080766e-07, + "loss": -0.0062, + "reward": 1.0267857611179352, + "reward_std": 0.2511092871427536, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9375000447034836, + "step": 2176 + }, + { + "completion_length": 936.4442596435547, + "epoch": 0.6502875065342394, + "grad_norm": 1.1627306938171387, + "kl": 0.361572265625, + "learning_rate": 3.9550301053590163e-07, + "loss": 0.0271, + "reward": 1.042410746216774, + "reward_std": 0.24195818603038788, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.9642857611179352, + "step": 2177 + }, + { + "completion_length": 941.8326263427734, + "epoch": 0.6505862146217609, + "grad_norm": 1.2672735452651978, + "kl": 0.440185546875, + "learning_rate": 3.950622628385777e-07, + "loss": 0.0356, + "reward": 1.082589328289032, + "reward_std": 0.24716775864362717, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.964285746216774, + "step": 2178 + }, + { + "completion_length": 798.6741333007812, + "epoch": 0.6508849227092823, + "grad_norm": 0.992817223072052, + "kl": 0.2177734375, + "learning_rate": 3.9462168369832614e-07, + "loss": 0.0376, + "reward": 1.1808036267757416, + "reward_std": 0.2356518991291523, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9732143431901932, + "step": 2179 + }, + { + "completion_length": 860.9576263427734, + "epoch": 0.6511836307968039, + "grad_norm": 0.7754021286964417, + "kl": 0.371826171875, + "learning_rate": 3.941812735944542e-07, + "loss": 0.0288, + "reward": 1.142857164144516, + "reward_std": 0.25925134867429733, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.9665178954601288, + "step": 2180 + }, + { + "completion_length": 928.5536193847656, + "epoch": 0.6514823388843253, + "grad_norm": 0.8439709544181824, + "kl": 0.3447265625, + "learning_rate": 3.9374103300608463e-07, + "loss": 0.008, + "reward": 1.1116071939468384, + "reward_std": 0.2263854295015335, + "rewards/accuracy_reward": 0.15401786379516125, + "rewards/format_reward": 0.9575893133878708, + "step": 2181 + }, + { + "completion_length": 821.9732666015625, + "epoch": 0.6517810469718468, + "grad_norm": 0.5954640507698059, + "kl": 0.283447265625, + "learning_rate": 3.933009624121562e-07, + "loss": 0.0074, + "reward": 1.223214328289032, + "reward_std": 0.24153369665145874, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.9776785969734192, + "step": 2182 + }, + { + "completion_length": 828.6004791259766, + "epoch": 0.6520797550593682, + "grad_norm": 1.3042945861816406, + "kl": 0.40771484375, + "learning_rate": 3.9286106229142224e-07, + "loss": 0.0376, + "reward": 1.0959821939468384, + "reward_std": 0.20976392552256584, + "rewards/accuracy_reward": 0.13839286682195961, + "rewards/format_reward": 0.957589328289032, + "step": 2183 + }, + { + "completion_length": 904.3147735595703, + "epoch": 0.6523784631468897, + "grad_norm": 0.9636551141738892, + "kl": 0.31103515625, + "learning_rate": 3.924213331224515e-07, + "loss": 0.0003, + "reward": 1.1049107611179352, + "reward_std": 0.14980854839086533, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.9776786118745804, + "step": 2184 + }, + { + "completion_length": 792.3862152099609, + "epoch": 0.6526771712344112, + "grad_norm": 1.5397543907165527, + "kl": 0.370849609375, + "learning_rate": 3.9198177538362585e-07, + "loss": 0.0396, + "reward": 1.0915178954601288, + "reward_std": 0.22270449250936508, + "rewards/accuracy_reward": 0.12276786658912897, + "rewards/format_reward": 0.9687500447034836, + "step": 2185 + }, + { + "completion_length": 821.4085235595703, + "epoch": 0.6529758793219327, + "grad_norm": 1.2460078001022339, + "kl": 0.31494140625, + "learning_rate": 3.915423895531411e-07, + "loss": 0.0261, + "reward": 1.1696428954601288, + "reward_std": 0.24142014235258102, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.964285746216774, + "step": 2186 + }, + { + "completion_length": 846.2210083007812, + "epoch": 0.6532745874094541, + "grad_norm": 1.0596303939819336, + "kl": 0.44873046875, + "learning_rate": 3.9110317610900613e-07, + "loss": 0.0245, + "reward": 1.1406250596046448, + "reward_std": 0.20945923775434494, + "rewards/accuracy_reward": 0.18750001303851604, + "rewards/format_reward": 0.9531250447034836, + "step": 2187 + }, + { + "completion_length": 810.0513763427734, + "epoch": 0.6535732954969756, + "grad_norm": 2.269152879714966, + "kl": 0.41015625, + "learning_rate": 3.90664135529042e-07, + "loss": 0.0435, + "reward": 1.1986607611179352, + "reward_std": 0.28846265748143196, + "rewards/accuracy_reward": 0.2410714440047741, + "rewards/format_reward": 0.9575893431901932, + "step": 2188 + }, + { + "completion_length": 807.3393249511719, + "epoch": 0.653872003584497, + "grad_norm": 1.2627413272857666, + "kl": 0.5048828125, + "learning_rate": 3.9022526829088176e-07, + "loss": 0.0167, + "reward": 1.087053656578064, + "reward_std": 0.2301219031214714, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.9531250447034836, + "step": 2189 + }, + { + "completion_length": 783.763427734375, + "epoch": 0.6541707116720186, + "grad_norm": 1.118760347366333, + "kl": 0.5908203125, + "learning_rate": 3.8978657487196987e-07, + "loss": 0.0106, + "reward": 1.0625000447034836, + "reward_std": 0.23487654700875282, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.9486607611179352, + "step": 2190 + }, + { + "completion_length": 900.841552734375, + "epoch": 0.65446941975954, + "grad_norm": 0.8280510306358337, + "kl": 0.6796875, + "learning_rate": 3.893480557495621e-07, + "loss": 0.0374, + "reward": 1.0424107611179352, + "reward_std": 0.19699178636074066, + "rewards/accuracy_reward": 0.07366071920841932, + "rewards/format_reward": 0.9687500298023224, + "step": 2191 + }, + { + "completion_length": 870.8326263427734, + "epoch": 0.6547681278470615, + "grad_norm": 1.0068539381027222, + "kl": 0.7060546875, + "learning_rate": 3.8890971140072405e-07, + "loss": -0.015, + "reward": 1.1517857611179352, + "reward_std": 0.2904214449226856, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9508928954601288, + "step": 2192 + }, + { + "completion_length": 808.5067291259766, + "epoch": 0.6550668359345829, + "grad_norm": 1.8234450817108154, + "kl": 0.7080078125, + "learning_rate": 3.884715423023314e-07, + "loss": 0.0136, + "reward": 1.1450893580913544, + "reward_std": 0.2735265903174877, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9620535969734192, + "step": 2193 + }, + { + "completion_length": 810.2366485595703, + "epoch": 0.6553655440221045, + "grad_norm": 1.1985753774642944, + "kl": 0.57861328125, + "learning_rate": 3.8803354893106933e-07, + "loss": 0.0211, + "reward": 1.223214328289032, + "reward_std": 0.2583621237426996, + "rewards/accuracy_reward": 0.2633928693830967, + "rewards/format_reward": 0.9598214626312256, + "step": 2194 + }, + { + "completion_length": 796.3839569091797, + "epoch": 0.6556642521096259, + "grad_norm": 1.056792974472046, + "kl": 0.7177734375, + "learning_rate": 3.875957317634315e-07, + "loss": 0.0084, + "reward": 1.0781250447034836, + "reward_std": 0.23595765605568886, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.9531250447034836, + "step": 2195 + }, + { + "completion_length": 850.5937805175781, + "epoch": 0.6559629601971473, + "grad_norm": 1.0017951726913452, + "kl": 0.55908203125, + "learning_rate": 3.871580912757203e-07, + "loss": 0.0159, + "reward": 1.082589328289032, + "reward_std": 0.21637976542115211, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.9665178954601288, + "step": 2196 + }, + { + "completion_length": 905.9152069091797, + "epoch": 0.6562616682846688, + "grad_norm": 1.0898809432983398, + "kl": 0.64111328125, + "learning_rate": 3.867206279440455e-07, + "loss": 0.0148, + "reward": 1.0959821790456772, + "reward_std": 0.2734699547290802, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.9330357611179352, + "step": 2197 + }, + { + "completion_length": 785.2210083007812, + "epoch": 0.6565603763721902, + "grad_norm": 1.2329567670822144, + "kl": 0.51904296875, + "learning_rate": 3.8628334224432437e-07, + "loss": -0.0002, + "reward": 1.113839328289032, + "reward_std": 0.2387469932436943, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.9598214626312256, + "step": 2198 + }, + { + "completion_length": 772.044677734375, + "epoch": 0.6568590844597118, + "grad_norm": 1.105236291885376, + "kl": 0.583984375, + "learning_rate": 3.8584623465228094e-07, + "loss": 0.0071, + "reward": 1.2142857611179352, + "reward_std": 0.3255111798644066, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.9598214775323868, + "step": 2199 + }, + { + "completion_length": 778.8861999511719, + "epoch": 0.6571577925472332, + "grad_norm": 1.164563775062561, + "kl": 0.74609375, + "learning_rate": 3.854093056434453e-07, + "loss": 0.0217, + "reward": 1.0468750298023224, + "reward_std": 0.23871609941124916, + "rewards/accuracy_reward": 0.09598214691504836, + "rewards/format_reward": 0.9508928954601288, + "step": 2200 + }, + { + "completion_length": 856.0759429931641, + "epoch": 0.6574565006347547, + "grad_norm": 1.2033424377441406, + "kl": 0.5751953125, + "learning_rate": 3.849725556931537e-07, + "loss": 0.0149, + "reward": 1.0915178954601288, + "reward_std": 0.2676367610692978, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.9375000298023224, + "step": 2201 + }, + { + "completion_length": 756.7567291259766, + "epoch": 0.6577552087222761, + "grad_norm": 0.8562536239624023, + "kl": 0.60888671875, + "learning_rate": 3.8453598527654696e-07, + "loss": 0.034, + "reward": 1.1339285969734192, + "reward_std": 0.21190836280584335, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.9508928805589676, + "step": 2202 + }, + { + "completion_length": 837.9375457763672, + "epoch": 0.6580539168097976, + "grad_norm": 1.810488224029541, + "kl": 0.650390625, + "learning_rate": 3.8409959486857116e-07, + "loss": 0.006, + "reward": 1.0334822237491608, + "reward_std": 0.21661437675356865, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.9397321790456772, + "step": 2203 + }, + { + "completion_length": 753.7567291259766, + "epoch": 0.658352624897319, + "grad_norm": 1.2731496095657349, + "kl": 0.6337890625, + "learning_rate": 3.836633849439759e-07, + "loss": 0.0274, + "reward": 1.1160714775323868, + "reward_std": 0.2566990628838539, + "rewards/accuracy_reward": 0.15848214668221772, + "rewards/format_reward": 0.9575893133878708, + "step": 2204 + }, + { + "completion_length": 815.779052734375, + "epoch": 0.6586513329848406, + "grad_norm": 1.168194055557251, + "kl": 0.5986328125, + "learning_rate": 3.8322735597731526e-07, + "loss": 0.0008, + "reward": 1.0245535969734192, + "reward_std": 0.2774139232933521, + "rewards/accuracy_reward": 0.08705357741564512, + "rewards/format_reward": 0.9375000447034836, + "step": 2205 + }, + { + "completion_length": 796.2076263427734, + "epoch": 0.658950041072362, + "grad_norm": 1.0347726345062256, + "kl": 0.51806640625, + "learning_rate": 3.8279150844294595e-07, + "loss": 0.0112, + "reward": 1.0625000596046448, + "reward_std": 0.21262701973319054, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.9531250298023224, + "step": 2206 + }, + { + "completion_length": 870.2232513427734, + "epoch": 0.6592487491598835, + "grad_norm": 1.3554155826568604, + "kl": 0.51025390625, + "learning_rate": 3.8235584281502696e-07, + "loss": 0.0019, + "reward": 1.0334821939468384, + "reward_std": 0.25320588424801826, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.9486607611179352, + "step": 2207 + }, + { + "completion_length": 743.0201263427734, + "epoch": 0.6595474572474049, + "grad_norm": 1.2184053659439087, + "kl": 0.494140625, + "learning_rate": 3.8192035956752033e-07, + "loss": -0.0062, + "reward": 1.160714328289032, + "reward_std": 0.2837858460843563, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9531250298023224, + "step": 2208 + }, + { + "completion_length": 794.5402221679688, + "epoch": 0.6598461653349265, + "grad_norm": 1.0476808547973633, + "kl": 0.5927734375, + "learning_rate": 3.814850591741889e-07, + "loss": 0.0004, + "reward": 1.1808036267757416, + "reward_std": 0.21734612062573433, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.9665178954601288, + "step": 2209 + }, + { + "completion_length": 846.8794860839844, + "epoch": 0.6601448734224479, + "grad_norm": 1.4911096096038818, + "kl": 0.751953125, + "learning_rate": 3.8104994210859687e-07, + "loss": 0.0067, + "reward": 1.1026786267757416, + "reward_std": 0.1777263507246971, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.957589328289032, + "step": 2210 + }, + { + "completion_length": 848.310302734375, + "epoch": 0.6604435815099694, + "grad_norm": 0.7863708734512329, + "kl": 0.5009765625, + "learning_rate": 3.806150088441087e-07, + "loss": 0.0266, + "reward": 1.0959821939468384, + "reward_std": 0.1966140829026699, + "rewards/accuracy_reward": 0.12946429150179029, + "rewards/format_reward": 0.9665178954601288, + "step": 2211 + }, + { + "completion_length": 875.9174499511719, + "epoch": 0.6607422895974908, + "grad_norm": 1.4332959651947021, + "kl": 0.45556640625, + "learning_rate": 3.8018025985388957e-07, + "loss": 0.0297, + "reward": 1.196428656578064, + "reward_std": 0.25910862907767296, + "rewards/accuracy_reward": 0.2209821492433548, + "rewards/format_reward": 0.9754464626312256, + "step": 2212 + }, + { + "completion_length": 798.3906707763672, + "epoch": 0.6610409976850123, + "grad_norm": 1.0266492366790771, + "kl": 0.6611328125, + "learning_rate": 3.7974569561090366e-07, + "loss": 0.0247, + "reward": 1.2299107611179352, + "reward_std": 0.1891281194984913, + "rewards/accuracy_reward": 0.2633928656578064, + "rewards/format_reward": 0.9665178954601288, + "step": 2213 + }, + { + "completion_length": 827.7210235595703, + "epoch": 0.6613397057725338, + "grad_norm": 1.2192158699035645, + "kl": 0.51416015625, + "learning_rate": 3.7931131658791406e-07, + "loss": 0.0334, + "reward": 1.0803571939468384, + "reward_std": 0.28314659744501114, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9531250447034836, + "step": 2214 + }, + { + "completion_length": 885.2701263427734, + "epoch": 0.6616384138600553, + "grad_norm": 1.3720203638076782, + "kl": 0.56884765625, + "learning_rate": 3.788771232574828e-07, + "loss": -0.0026, + "reward": 1.2299107611179352, + "reward_std": 0.307778749614954, + "rewards/accuracy_reward": 0.27232144493609667, + "rewards/format_reward": 0.957589328289032, + "step": 2215 + }, + { + "completion_length": 801.9620819091797, + "epoch": 0.6619371219475767, + "grad_norm": 1.0210853815078735, + "kl": 0.55224609375, + "learning_rate": 3.7844311609196964e-07, + "loss": 0.0011, + "reward": 1.1428571939468384, + "reward_std": 0.2526678554713726, + "rewards/accuracy_reward": 0.19866072572767735, + "rewards/format_reward": 0.9441964775323868, + "step": 2216 + }, + { + "completion_length": 821.7879791259766, + "epoch": 0.6622358300350982, + "grad_norm": 1.7034547328948975, + "kl": 0.450927734375, + "learning_rate": 3.780092955635318e-07, + "loss": 0.021, + "reward": 1.1116071939468384, + "reward_std": 0.23935559019446373, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9665178954601288, + "step": 2217 + }, + { + "completion_length": 844.0000457763672, + "epoch": 0.6625345381226196, + "grad_norm": 1.4468460083007812, + "kl": 0.45458984375, + "learning_rate": 3.775756621441233e-07, + "loss": 0.0204, + "reward": 1.1383928954601288, + "reward_std": 0.26263685896992683, + "rewards/accuracy_reward": 0.1830357275903225, + "rewards/format_reward": 0.9553571790456772, + "step": 2218 + }, + { + "completion_length": 727.1317138671875, + "epoch": 0.6628332462101412, + "grad_norm": 1.0976574420928955, + "kl": 0.6123046875, + "learning_rate": 3.7714221630549513e-07, + "loss": 0.0176, + "reward": 1.0781250298023224, + "reward_std": 0.24188192933797836, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.9508928954601288, + "step": 2219 + }, + { + "completion_length": 822.060302734375, + "epoch": 0.6631319542976626, + "grad_norm": 1.6909688711166382, + "kl": 0.46337890625, + "learning_rate": 3.767089585191937e-07, + "loss": 0.0514, + "reward": 1.1004465073347092, + "reward_std": 0.22302120178937912, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9598214775323868, + "step": 2220 + }, + { + "completion_length": 897.7567291259766, + "epoch": 0.6634306623851841, + "grad_norm": 1.0107601881027222, + "kl": 0.59326171875, + "learning_rate": 3.762758892565612e-07, + "loss": 0.0213, + "reward": 1.2477679252624512, + "reward_std": 0.2548036314547062, + "rewards/accuracy_reward": 0.2924107275903225, + "rewards/format_reward": 0.9553571939468384, + "step": 2221 + }, + { + "completion_length": 847.1897888183594, + "epoch": 0.6637293704727055, + "grad_norm": 1.0701662302017212, + "kl": 0.56787109375, + "learning_rate": 3.758430089887341e-07, + "loss": 0.0071, + "reward": 1.1026786267757416, + "reward_std": 0.27640533074736595, + "rewards/accuracy_reward": 0.14955358020961285, + "rewards/format_reward": 0.9531250596046448, + "step": 2222 + }, + { + "completion_length": 870.6562805175781, + "epoch": 0.664028078560227, + "grad_norm": 1.4319279193878174, + "kl": 0.455078125, + "learning_rate": 3.754103181866443e-07, + "loss": -0.0025, + "reward": 1.160714328289032, + "reward_std": 0.16847235150635242, + "rewards/accuracy_reward": 0.18750000861473382, + "rewards/format_reward": 0.973214328289032, + "step": 2223 + }, + { + "completion_length": 839.0848541259766, + "epoch": 0.6643267866477485, + "grad_norm": 1.0903375148773193, + "kl": 0.59619140625, + "learning_rate": 3.749778173210165e-07, + "loss": 0.0035, + "reward": 1.113839328289032, + "reward_std": 0.15455561690032482, + "rewards/accuracy_reward": 0.14732143771834671, + "rewards/format_reward": 0.9665178954601288, + "step": 2224 + }, + { + "completion_length": 835.2522583007812, + "epoch": 0.66462549473527, + "grad_norm": 1.0021915435791016, + "kl": 0.580078125, + "learning_rate": 3.745455068623694e-07, + "loss": 0.0128, + "reward": 1.1116071939468384, + "reward_std": 0.25659292563796043, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9464286118745804, + "step": 2225 + }, + { + "completion_length": 848.0848693847656, + "epoch": 0.6649242028227914, + "grad_norm": 1.092597246170044, + "kl": 0.41552734375, + "learning_rate": 3.741133872810146e-07, + "loss": 0.0565, + "reward": 1.1584821939468384, + "reward_std": 0.25265203788876534, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.9687500447034836, + "step": 2226 + }, + { + "completion_length": 852.3705596923828, + "epoch": 0.6652229109103129, + "grad_norm": 0.8083257079124451, + "kl": 0.47705078125, + "learning_rate": 3.7368145904705564e-07, + "loss": 0.0233, + "reward": 1.116071492433548, + "reward_std": 0.21442830562591553, + "rewards/accuracy_reward": 0.16517858020961285, + "rewards/format_reward": 0.9508928954601288, + "step": 2227 + }, + { + "completion_length": 845.747802734375, + "epoch": 0.6655216189978344, + "grad_norm": 1.9367097616195679, + "kl": 0.6748046875, + "learning_rate": 3.732497226303881e-07, + "loss": 0.0303, + "reward": 1.2209821939468384, + "reward_std": 0.28966863825917244, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.9486607611179352, + "step": 2228 + }, + { + "completion_length": 843.9598541259766, + "epoch": 0.6658203270853559, + "grad_norm": 3.153247356414795, + "kl": 0.5205078125, + "learning_rate": 3.728181785006991e-07, + "loss": 0.0464, + "reward": 1.0892857611179352, + "reward_std": 0.22999022528529167, + "rewards/accuracy_reward": 0.14062500838190317, + "rewards/format_reward": 0.9486607611179352, + "step": 2229 + }, + { + "completion_length": 771.357177734375, + "epoch": 0.6661190351728773, + "grad_norm": 1.2517701387405396, + "kl": 0.343505859375, + "learning_rate": 3.7238682712746606e-07, + "loss": 0.0105, + "reward": 1.0848214626312256, + "reward_std": 0.3051018565893173, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.941964328289032, + "step": 2230 + }, + { + "completion_length": 795.6942291259766, + "epoch": 0.6664177432603988, + "grad_norm": 2.2871670722961426, + "kl": 0.578125, + "learning_rate": 3.719556689799572e-07, + "loss": 0.0575, + "reward": 1.176339328289032, + "reward_std": 0.24166937917470932, + "rewards/accuracy_reward": 0.2165178619325161, + "rewards/format_reward": 0.9598214775323868, + "step": 2231 + }, + { + "completion_length": 748.4375305175781, + "epoch": 0.6667164513479202, + "grad_norm": 1.276672124862671, + "kl": 0.375, + "learning_rate": 3.7152470452723015e-07, + "loss": 0.0028, + "reward": 1.1785714626312256, + "reward_std": 0.20294154062867165, + "rewards/accuracy_reward": 0.20982143841683865, + "rewards/format_reward": 0.9687500447034836, + "step": 2232 + }, + { + "completion_length": 818.8281555175781, + "epoch": 0.6670151594354418, + "grad_norm": 0.7750389575958252, + "kl": 0.50439453125, + "learning_rate": 3.710939342381324e-07, + "loss": 0.0045, + "reward": 1.0290178805589676, + "reward_std": 0.22991296648979187, + "rewards/accuracy_reward": 0.07589286123402417, + "rewards/format_reward": 0.9531250447034836, + "step": 2233 + }, + { + "completion_length": 936.5603179931641, + "epoch": 0.6673138675229632, + "grad_norm": 0.8069069385528564, + "kl": 0.57275390625, + "learning_rate": 3.7066335858129925e-07, + "loss": 0.0204, + "reward": 1.049107164144516, + "reward_std": 0.24818790704011917, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9464286118745804, + "step": 2234 + }, + { + "completion_length": 812.0245971679688, + "epoch": 0.6676125756104847, + "grad_norm": 0.8936922550201416, + "kl": 0.49365234375, + "learning_rate": 3.702329780251552e-07, + "loss": 0.0111, + "reward": 1.1696428954601288, + "reward_std": 0.1963910274207592, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9754464477300644, + "step": 2235 + }, + { + "completion_length": 895.8103179931641, + "epoch": 0.6679112836980061, + "grad_norm": 1.717203140258789, + "kl": 0.41064453125, + "learning_rate": 3.6980279303791193e-07, + "loss": -0.0146, + "reward": 1.0468750447034836, + "reward_std": 0.1788266934454441, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.9598214626312256, + "step": 2236 + }, + { + "completion_length": 804.2366485595703, + "epoch": 0.6682099917855276, + "grad_norm": 1.1638609170913696, + "kl": 0.58837890625, + "learning_rate": 3.693728040875688e-07, + "loss": -0.0157, + "reward": 1.0758928954601288, + "reward_std": 0.24640940874814987, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.9620536118745804, + "step": 2237 + }, + { + "completion_length": 904.0111999511719, + "epoch": 0.6685086998730491, + "grad_norm": 1.5329416990280151, + "kl": 0.5712890625, + "learning_rate": 3.689430116419112e-07, + "loss": 0.0271, + "reward": 1.0982143431901932, + "reward_std": 0.19686757773160934, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9575893133878708, + "step": 2238 + }, + { + "completion_length": 798.6808319091797, + "epoch": 0.6688074079605706, + "grad_norm": 1.5516165494918823, + "kl": 0.52099609375, + "learning_rate": 3.685134161685115e-07, + "loss": 0.0488, + "reward": 1.20089291036129, + "reward_std": 0.32132588326931, + "rewards/accuracy_reward": 0.2500000037252903, + "rewards/format_reward": 0.9508928954601288, + "step": 2239 + }, + { + "completion_length": 867.5960235595703, + "epoch": 0.669106116048092, + "grad_norm": 1.4218155145645142, + "kl": 0.66845703125, + "learning_rate": 3.6808401813472754e-07, + "loss": 0.046, + "reward": 1.0736607909202576, + "reward_std": 0.22731849551200867, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9553571790456772, + "step": 2240 + }, + { + "completion_length": 744.8616485595703, + "epoch": 0.6694048241356134, + "grad_norm": 1.3336325883865356, + "kl": 0.54931640625, + "learning_rate": 3.6765481800770217e-07, + "loss": 0.0427, + "reward": 1.1406250298023224, + "reward_std": 0.2471436783671379, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.964285746216774, + "step": 2241 + }, + { + "completion_length": 835.591552734375, + "epoch": 0.669703532223135, + "grad_norm": 0.9169252514839172, + "kl": 0.64794921875, + "learning_rate": 3.67225816254363e-07, + "loss": 0.0146, + "reward": 1.180803656578064, + "reward_std": 0.24599085003137589, + "rewards/accuracy_reward": 0.22544643469154835, + "rewards/format_reward": 0.9553571790456772, + "step": 2242 + }, + { + "completion_length": 702.919677734375, + "epoch": 0.6700022403106564, + "grad_norm": 1.3325724601745605, + "kl": 0.41064453125, + "learning_rate": 3.6679701334142177e-07, + "loss": 0.0499, + "reward": 1.2611607909202576, + "reward_std": 0.17031050473451614, + "rewards/accuracy_reward": 0.2946428656578064, + "rewards/format_reward": 0.96651791036129, + "step": 2243 + }, + { + "completion_length": 822.5268249511719, + "epoch": 0.6703009483981779, + "grad_norm": 1.1524571180343628, + "kl": 0.50048828125, + "learning_rate": 3.6636840973537443e-07, + "loss": 0.0382, + "reward": 1.098214328289032, + "reward_std": 0.21522748935967684, + "rewards/accuracy_reward": 0.14508929662406445, + "rewards/format_reward": 0.9531250447034836, + "step": 2244 + }, + { + "completion_length": 769.3504791259766, + "epoch": 0.6705996564856993, + "grad_norm": 1.0850114822387695, + "kl": 0.332275390625, + "learning_rate": 3.659400059024994e-07, + "loss": 0.0074, + "reward": 1.2187500298023224, + "reward_std": 0.24101493507623672, + "rewards/accuracy_reward": 0.25223216228187084, + "rewards/format_reward": 0.9665178954601288, + "step": 2245 + }, + { + "completion_length": 859.1652221679688, + "epoch": 0.6708983645732208, + "grad_norm": 1.4560341835021973, + "kl": 0.6484375, + "learning_rate": 3.6551180230885814e-07, + "loss": 0.0455, + "reward": 1.1361607611179352, + "reward_std": 0.19654385186731815, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9687500447034836, + "step": 2246 + }, + { + "completion_length": 820.6652221679688, + "epoch": 0.6711970726607422, + "grad_norm": 1.4867334365844727, + "kl": 0.4931640625, + "learning_rate": 3.650837994202942e-07, + "loss": 0.0211, + "reward": 1.178571492433548, + "reward_std": 0.20628372952342033, + "rewards/accuracy_reward": 0.2187500074505806, + "rewards/format_reward": 0.9598214775323868, + "step": 2247 + }, + { + "completion_length": 887.7545166015625, + "epoch": 0.6714957807482638, + "grad_norm": 1.0036463737487793, + "kl": 0.56005859375, + "learning_rate": 3.646559977024327e-07, + "loss": 0.0203, + "reward": 1.0245536267757416, + "reward_std": 0.24580496922135353, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.9531250447034836, + "step": 2248 + }, + { + "completion_length": 821.1339569091797, + "epoch": 0.6717944888357852, + "grad_norm": 0.8736220598220825, + "kl": 0.60693359375, + "learning_rate": 3.6422839762068016e-07, + "loss": 0.0401, + "reward": 1.1026786267757416, + "reward_std": 0.22044813632965088, + "rewards/accuracy_reward": 0.13616071827709675, + "rewards/format_reward": 0.9665178954601288, + "step": 2249 + }, + { + "completion_length": 788.7053985595703, + "epoch": 0.6720931969233067, + "grad_norm": 1.8133939504623413, + "kl": 0.74169921875, + "learning_rate": 3.638009996402233e-07, + "loss": 0.0158, + "reward": 1.1540179252624512, + "reward_std": 0.34043245762586594, + "rewards/accuracy_reward": 0.22098215110599995, + "rewards/format_reward": 0.9330357611179352, + "step": 2250 + }, + { + "completion_length": 850.0379943847656, + "epoch": 0.6723919050108281, + "grad_norm": 1.4996165037155151, + "kl": 0.89453125, + "learning_rate": 3.6337380422602935e-07, + "loss": 0.0396, + "reward": 1.1517857909202576, + "reward_std": 0.22887593507766724, + "rewards/accuracy_reward": 0.19419644260779023, + "rewards/format_reward": 0.9575893133878708, + "step": 2251 + }, + { + "completion_length": 869.232177734375, + "epoch": 0.6726906130983497, + "grad_norm": 1.0174270868301392, + "kl": 0.43408203125, + "learning_rate": 3.6294681184284514e-07, + "loss": 0.0029, + "reward": 1.084821492433548, + "reward_std": 0.21655553206801414, + "rewards/accuracy_reward": 0.13392857555299997, + "rewards/format_reward": 0.9508928954601288, + "step": 2252 + }, + { + "completion_length": 871.7411041259766, + "epoch": 0.6729893211858711, + "grad_norm": 0.9731213450431824, + "kl": 0.677734375, + "learning_rate": 3.625200229551966e-07, + "loss": 0.0305, + "reward": 1.1183036267757416, + "reward_std": 0.26449456438422203, + "rewards/accuracy_reward": 0.1651785746216774, + "rewards/format_reward": 0.9531250447034836, + "step": 2253 + }, + { + "completion_length": 851.3504943847656, + "epoch": 0.6732880292733926, + "grad_norm": 19.213655471801758, + "kl": 1.20703125, + "learning_rate": 3.6209343802738776e-07, + "loss": 0.0582, + "reward": 1.0669642984867096, + "reward_std": 0.2958991006016731, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.9375000447034836, + "step": 2254 + }, + { + "completion_length": 846.7567443847656, + "epoch": 0.673586737360914, + "grad_norm": 1.5371551513671875, + "kl": 0.7490234375, + "learning_rate": 3.6166705752350167e-07, + "loss": 0.0333, + "reward": 1.111607164144516, + "reward_std": 0.27906544879078865, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.9620536118745804, + "step": 2255 + }, + { + "completion_length": 895.9620819091797, + "epoch": 0.6738854454484355, + "grad_norm": 2.0265214443206787, + "kl": 0.763671875, + "learning_rate": 3.6124088190739843e-07, + "loss": 0.0174, + "reward": 1.053571492433548, + "reward_std": 0.22361263632774353, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.9486607611179352, + "step": 2256 + }, + { + "completion_length": 963.8549652099609, + "epoch": 0.674184153535957, + "grad_norm": 1.2740646600723267, + "kl": 0.58837890625, + "learning_rate": 3.6081491164271525e-07, + "loss": 0.0225, + "reward": 1.0803572088479996, + "reward_std": 0.222844410687685, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.9553571790456772, + "step": 2257 + }, + { + "completion_length": 839.7388763427734, + "epoch": 0.6744828616234785, + "grad_norm": 6.187489986419678, + "kl": 0.69140625, + "learning_rate": 3.6038914719286606e-07, + "loss": 0.0418, + "reward": 1.0803571939468384, + "reward_std": 0.1951339803636074, + "rewards/accuracy_reward": 0.12500000419095159, + "rewards/format_reward": 0.9553571790456772, + "step": 2258 + }, + { + "completion_length": 804.872802734375, + "epoch": 0.6747815697109999, + "grad_norm": 1.484492540359497, + "kl": 0.8330078125, + "learning_rate": 3.59963589021041e-07, + "loss": 0.0205, + "reward": 1.149553656578064, + "reward_std": 0.24071120098233223, + "rewards/accuracy_reward": 0.19866072572767735, + "rewards/format_reward": 0.95089291036129, + "step": 2259 + }, + { + "completion_length": 789.4107360839844, + "epoch": 0.6750802777985214, + "grad_norm": 3.219385862350464, + "kl": 0.5361328125, + "learning_rate": 3.595382375902053e-07, + "loss": 0.0606, + "reward": 1.15401791036129, + "reward_std": 0.20594284497201443, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/format_reward": 0.9665178954601288, + "step": 2260 + }, + { + "completion_length": 784.0647583007812, + "epoch": 0.6753789858860428, + "grad_norm": 1.2670762538909912, + "kl": 0.6767578125, + "learning_rate": 3.5911309336310004e-07, + "loss": 0.0473, + "reward": 1.1250000447034836, + "reward_std": 0.24465248361229897, + "rewards/accuracy_reward": 0.17410715040750802, + "rewards/format_reward": 0.9508928954601288, + "step": 2261 + }, + { + "completion_length": 918.841552734375, + "epoch": 0.6756776939735644, + "grad_norm": 0.950101375579834, + "kl": 0.53662109375, + "learning_rate": 3.5868815680224007e-07, + "loss": 0.0228, + "reward": 1.0312500298023224, + "reward_std": 0.24469802156090736, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.9508928954601288, + "step": 2262 + }, + { + "completion_length": 781.8906555175781, + "epoch": 0.6759764020610858, + "grad_norm": 0.7884002923965454, + "kl": 0.494140625, + "learning_rate": 3.582634283699151e-07, + "loss": 0.0171, + "reward": 1.1674107611179352, + "reward_std": 0.21077418699860573, + "rewards/accuracy_reward": 0.19196429196745157, + "rewards/format_reward": 0.9754464626312256, + "step": 2263 + }, + { + "completion_length": 871.8772888183594, + "epoch": 0.6762751101486073, + "grad_norm": 1.3628549575805664, + "kl": 0.6591796875, + "learning_rate": 3.5783890852818777e-07, + "loss": 0.0213, + "reward": 1.2053571790456772, + "reward_std": 0.32494477182626724, + "rewards/accuracy_reward": 0.25892858672887087, + "rewards/format_reward": 0.9464286267757416, + "step": 2264 + }, + { + "completion_length": 785.7567138671875, + "epoch": 0.6765738182361287, + "grad_norm": 1.2467848062515259, + "kl": 0.585205078125, + "learning_rate": 3.574145977388942e-07, + "loss": 0.0389, + "reward": 1.225446492433548, + "reward_std": 0.27623745426535606, + "rewards/accuracy_reward": 0.247767873108387, + "rewards/format_reward": 0.9776786267757416, + "step": 2265 + }, + { + "completion_length": 817.1205749511719, + "epoch": 0.6768725263236502, + "grad_norm": 0.8205963969230652, + "kl": 0.48193359375, + "learning_rate": 3.569904964636428e-07, + "loss": 0.0439, + "reward": 1.1294642984867096, + "reward_std": 0.19551634788513184, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9754464775323868, + "step": 2266 + }, + { + "completion_length": 851.3861999511719, + "epoch": 0.6771712344111717, + "grad_norm": 0.9083226919174194, + "kl": 0.5166015625, + "learning_rate": 3.565666051638144e-07, + "loss": 0.0052, + "reward": 1.1406250596046448, + "reward_std": 0.23840085417032242, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/format_reward": 0.9531250447034836, + "step": 2267 + }, + { + "completion_length": 823.1004791259766, + "epoch": 0.6774699424986932, + "grad_norm": 1.9600632190704346, + "kl": 0.92041015625, + "learning_rate": 3.5614292430056094e-07, + "loss": 0.0466, + "reward": 1.1674107611179352, + "reward_std": 0.2788166403770447, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9486607611179352, + "step": 2268 + }, + { + "completion_length": 894.5826110839844, + "epoch": 0.6777686505862146, + "grad_norm": 1.4452954530715942, + "kl": 0.85546875, + "learning_rate": 3.5571945433480586e-07, + "loss": 0.0515, + "reward": 1.1562500596046448, + "reward_std": 0.2789771445095539, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.957589328289032, + "step": 2269 + }, + { + "completion_length": 886.357177734375, + "epoch": 0.6780673586737361, + "grad_norm": 1.1351715326309204, + "kl": 0.56689453125, + "learning_rate": 3.5529619572724303e-07, + "loss": 0.0363, + "reward": 1.1004464626312256, + "reward_std": 0.1679651029407978, + "rewards/accuracy_reward": 0.11607143562287092, + "rewards/format_reward": 0.9843750298023224, + "step": 2270 + }, + { + "completion_length": 844.2611999511719, + "epoch": 0.6783660667612575, + "grad_norm": 1.3915917873382568, + "kl": 0.5888671875, + "learning_rate": 3.548731489383361e-07, + "loss": 0.0583, + "reward": 1.0892857313156128, + "reward_std": 0.21995088644325733, + "rewards/accuracy_reward": 0.12500000116415322, + "rewards/format_reward": 0.964285746216774, + "step": 2271 + }, + { + "completion_length": 875.2902221679688, + "epoch": 0.6786647748487791, + "grad_norm": 2.0584309101104736, + "kl": 0.60693359375, + "learning_rate": 3.5445031442831876e-07, + "loss": 0.0094, + "reward": 1.1584822237491608, + "reward_std": 0.3039470873773098, + "rewards/accuracy_reward": 0.20982143469154835, + "rewards/format_reward": 0.948660746216774, + "step": 2272 + }, + { + "completion_length": 866.7701263427734, + "epoch": 0.6789634829363005, + "grad_norm": 0.6924609541893005, + "kl": 0.4951171875, + "learning_rate": 3.540276926571932e-07, + "loss": 0.0331, + "reward": 1.1160714626312256, + "reward_std": 0.1897125542163849, + "rewards/accuracy_reward": 0.14955357694998384, + "rewards/format_reward": 0.96651791036129, + "step": 2273 + }, + { + "completion_length": 859.9196929931641, + "epoch": 0.679262191023822, + "grad_norm": 1.5194066762924194, + "kl": 0.6796875, + "learning_rate": 3.5360528408473076e-07, + "loss": 0.0058, + "reward": 1.1339286267757416, + "reward_std": 0.23087770119309425, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.955357164144516, + "step": 2274 + }, + { + "completion_length": 856.0982513427734, + "epoch": 0.6795608991113434, + "grad_norm": 1.5075721740722656, + "kl": 0.69677734375, + "learning_rate": 3.531830891704707e-07, + "loss": 0.0204, + "reward": 1.0937500596046448, + "reward_std": 0.2548533007502556, + "rewards/accuracy_reward": 0.1406250111758709, + "rewards/format_reward": 0.9531250298023224, + "step": 2275 + }, + { + "completion_length": 844.8036041259766, + "epoch": 0.679859607198865, + "grad_norm": 2.2824583053588867, + "kl": 0.81591796875, + "learning_rate": 3.527611083737192e-07, + "loss": 0.0234, + "reward": 1.0937500596046448, + "reward_std": 0.24190940335392952, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.95089291036129, + "step": 2276 + }, + { + "completion_length": 802.5580749511719, + "epoch": 0.6801583152863864, + "grad_norm": 1.049058198928833, + "kl": 0.537109375, + "learning_rate": 3.5233934215355033e-07, + "loss": 0.0142, + "reward": 1.2053572237491608, + "reward_std": 0.2540353797376156, + "rewards/accuracy_reward": 0.2410714365541935, + "rewards/format_reward": 0.964285746216774, + "step": 2277 + }, + { + "completion_length": 805.8437957763672, + "epoch": 0.6804570233739079, + "grad_norm": 1.3069034814834595, + "kl": 0.869140625, + "learning_rate": 3.519177909688046e-07, + "loss": 0.0549, + "reward": 1.1316964626312256, + "reward_std": 0.25325799733400345, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9531250447034836, + "step": 2278 + }, + { + "completion_length": 813.3303985595703, + "epoch": 0.6807557314614293, + "grad_norm": 1.0835779905319214, + "kl": 0.6083984375, + "learning_rate": 3.514964552780879e-07, + "loss": 0.0096, + "reward": 1.0915179252624512, + "reward_std": 0.23458043299615383, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.9642857611179352, + "step": 2279 + }, + { + "completion_length": 827.1920013427734, + "epoch": 0.6810544395489508, + "grad_norm": 0.8161711096763611, + "kl": 0.6337890625, + "learning_rate": 3.5107533553977244e-07, + "loss": 0.0213, + "reward": 1.194196492433548, + "reward_std": 0.20195953641086817, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.98214291036129, + "step": 2280 + }, + { + "completion_length": 859.8616333007812, + "epoch": 0.6813531476364723, + "grad_norm": 0.6372631192207336, + "kl": 0.5654296875, + "learning_rate": 3.506544322119954e-07, + "loss": 0.0005, + "reward": 1.1584821790456772, + "reward_std": 0.18902800232172012, + "rewards/accuracy_reward": 0.18973215040750802, + "rewards/format_reward": 0.9687500298023224, + "step": 2281 + }, + { + "completion_length": 840.3214569091797, + "epoch": 0.6816518557239938, + "grad_norm": 0.8909034132957458, + "kl": 0.68017578125, + "learning_rate": 3.50233745752658e-07, + "loss": -0.0133, + "reward": 1.0647321939468384, + "reward_std": 0.24218735471367836, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9598214626312256, + "step": 2282 + }, + { + "completion_length": 781.4442138671875, + "epoch": 0.6819505638115152, + "grad_norm": 1.6632649898529053, + "kl": 0.6435546875, + "learning_rate": 3.4981327661942594e-07, + "loss": 0.0159, + "reward": 1.2008929252624512, + "reward_std": 0.20267204567790031, + "rewards/accuracy_reward": 0.22544644214212894, + "rewards/format_reward": 0.9754464775323868, + "step": 2283 + }, + { + "completion_length": 837.0714721679688, + "epoch": 0.6822492718990366, + "grad_norm": 0.9626357555389404, + "kl": 0.59619140625, + "learning_rate": 3.4939302526972874e-07, + "loss": 0.0302, + "reward": 1.1138393431901932, + "reward_std": 0.22652501799166203, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.957589328289032, + "step": 2284 + }, + { + "completion_length": 973.4687957763672, + "epoch": 0.6825479799865581, + "grad_norm": 1.1346063613891602, + "kl": 0.439208984375, + "learning_rate": 3.489729921607583e-07, + "loss": 0.0316, + "reward": 1.1383929252624512, + "reward_std": 0.25307031348347664, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9598214775323868, + "step": 2285 + }, + { + "completion_length": 788.9286193847656, + "epoch": 0.6828466880740796, + "grad_norm": 0.9839189648628235, + "kl": 0.66015625, + "learning_rate": 3.485531777494698e-07, + "loss": 0.0185, + "reward": 1.1227678954601288, + "reward_std": 0.21691285446286201, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.9598214775323868, + "step": 2286 + }, + { + "completion_length": 894.9174652099609, + "epoch": 0.6831453961616011, + "grad_norm": 1.2366605997085571, + "kl": 0.52490234375, + "learning_rate": 3.4813358249257973e-07, + "loss": 0.0395, + "reward": 1.0491071790456772, + "reward_std": 0.22930796071887016, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.9508928954601288, + "step": 2287 + }, + { + "completion_length": 844.3192291259766, + "epoch": 0.6834441042491225, + "grad_norm": 1.6227766275405884, + "kl": 0.552734375, + "learning_rate": 3.47714206846567e-07, + "loss": 0.0254, + "reward": 1.2008928954601288, + "reward_std": 0.22947021573781967, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9687500596046448, + "step": 2288 + }, + { + "completion_length": 891.0268249511719, + "epoch": 0.683742812336644, + "grad_norm": 1.0604673624038696, + "kl": 0.5654296875, + "learning_rate": 3.472950512676712e-07, + "loss": 0.0157, + "reward": 1.1406250298023224, + "reward_std": 0.21417466551065445, + "rewards/accuracy_reward": 0.18303572572767735, + "rewards/format_reward": 0.9575893431901932, + "step": 2289 + }, + { + "completion_length": 873.8794860839844, + "epoch": 0.6840415204241654, + "grad_norm": 1.3052457571029663, + "kl": 0.43408203125, + "learning_rate": 3.4687611621189216e-07, + "loss": 0.0071, + "reward": 1.15401791036129, + "reward_std": 0.1459372527897358, + "rewards/accuracy_reward": 0.18526786309666932, + "rewards/format_reward": 0.9687500298023224, + "step": 2290 + }, + { + "completion_length": 773.1406555175781, + "epoch": 0.684340228511687, + "grad_norm": 1.3318917751312256, + "kl": 0.865234375, + "learning_rate": 3.4645740213499047e-07, + "loss": 0.035, + "reward": 1.209821492433548, + "reward_std": 0.22496242821216583, + "rewards/accuracy_reward": 0.2299107201397419, + "rewards/format_reward": 0.9799107611179352, + "step": 2291 + }, + { + "completion_length": 908.2902069091797, + "epoch": 0.6846389365992084, + "grad_norm": 1.2180023193359375, + "kl": 0.52294921875, + "learning_rate": 3.460389094924861e-07, + "loss": 0.0424, + "reward": 1.1964286267757416, + "reward_std": 0.2773685120046139, + "rewards/accuracy_reward": 0.21651786752045155, + "rewards/format_reward": 0.979910746216774, + "step": 2292 + }, + { + "completion_length": 847.9844055175781, + "epoch": 0.6849376446867299, + "grad_norm": 0.9941182732582092, + "kl": 0.34716796875, + "learning_rate": 3.4562063873965774e-07, + "loss": 0.0355, + "reward": 1.2187500298023224, + "reward_std": 0.20681234635412693, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9821428805589676, + "step": 2293 + }, + { + "completion_length": 851.5602874755859, + "epoch": 0.6852363527742513, + "grad_norm": 1.2070655822753906, + "kl": 0.61962890625, + "learning_rate": 3.4520259033154296e-07, + "loss": -0.0109, + "reward": 1.111607164144516, + "reward_std": 0.2614549249410629, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9642857611179352, + "step": 2294 + }, + { + "completion_length": 773.6205902099609, + "epoch": 0.6855350608617728, + "grad_norm": 1.0060524940490723, + "kl": 0.43017578125, + "learning_rate": 3.447847647229379e-07, + "loss": -0.0019, + "reward": 1.1428571939468384, + "reward_std": 0.20369491074234247, + "rewards/accuracy_reward": 0.16964285750873387, + "rewards/format_reward": 0.9732143133878708, + "step": 2295 + }, + { + "completion_length": 753.8482513427734, + "epoch": 0.6858337689492943, + "grad_norm": 2.1496355533599854, + "kl": 0.765625, + "learning_rate": 3.443671623683954e-07, + "loss": 0.0149, + "reward": 1.1852679252624512, + "reward_std": 0.24227412417531013, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.9620535969734192, + "step": 2296 + }, + { + "completion_length": 748.1428833007812, + "epoch": 0.6861324770368158, + "grad_norm": 0.6634538769721985, + "kl": 0.437255859375, + "learning_rate": 3.43949783722226e-07, + "loss": -0.0019, + "reward": 1.2678571939468384, + "reward_std": 0.14790172968059778, + "rewards/accuracy_reward": 0.2857143022119999, + "rewards/format_reward": 0.9821428954601288, + "step": 2297 + }, + { + "completion_length": 892.8928833007812, + "epoch": 0.6864311851243372, + "grad_norm": 0.8538684844970703, + "kl": 0.55078125, + "learning_rate": 3.43532629238497e-07, + "loss": 0.0232, + "reward": 1.080357164144516, + "reward_std": 0.21495374105870724, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.9642857611179352, + "step": 2298 + }, + { + "completion_length": 932.9397735595703, + "epoch": 0.6867298932118587, + "grad_norm": 1.2732393741607666, + "kl": 0.58349609375, + "learning_rate": 3.431156993710312e-07, + "loss": 0.0459, + "reward": 1.1026786118745804, + "reward_std": 0.1840931922197342, + "rewards/accuracy_reward": 0.13169643143191934, + "rewards/format_reward": 0.9709821790456772, + "step": 2299 + }, + { + "completion_length": 813.7187957763672, + "epoch": 0.6870286012993801, + "grad_norm": 1.6055301427841187, + "kl": 0.49072265625, + "learning_rate": 3.42698994573408e-07, + "loss": 0.0384, + "reward": 1.1205357611179352, + "reward_std": 0.21967968717217445, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.9687500447034836, + "step": 2300 + }, + { + "completion_length": 798.4620971679688, + "epoch": 0.6873273093869017, + "grad_norm": 1.1361560821533203, + "kl": 0.47509765625, + "learning_rate": 3.422825152989606e-07, + "loss": 0.0354, + "reward": 1.1205357611179352, + "reward_std": 0.21736561134457588, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9754464775323868, + "step": 2301 + }, + { + "completion_length": 803.5670013427734, + "epoch": 0.6876260174744231, + "grad_norm": 0.6067108511924744, + "kl": 0.396484375, + "learning_rate": 3.418662620007782e-07, + "loss": 0.0079, + "reward": 1.147321492433548, + "reward_std": 0.1505634505301714, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9821428954601288, + "step": 2302 + }, + { + "completion_length": 854.3013916015625, + "epoch": 0.6879247255619446, + "grad_norm": 2.3292627334594727, + "kl": 0.568359375, + "learning_rate": 3.4145023513170355e-07, + "loss": 0.0223, + "reward": 1.1651786267757416, + "reward_std": 0.25655287876725197, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.9620536267757416, + "step": 2303 + }, + { + "completion_length": 835.0312805175781, + "epoch": 0.688223433649466, + "grad_norm": 0.7953203320503235, + "kl": 0.63427734375, + "learning_rate": 3.410344351443329e-07, + "loss": 0.0123, + "reward": 1.1093750596046448, + "reward_std": 0.16620290651917458, + "rewards/accuracy_reward": 0.13392858067527413, + "rewards/format_reward": 0.9754464626312256, + "step": 2304 + }, + { + "completion_length": 867.763427734375, + "epoch": 0.6885221417369876, + "grad_norm": 0.7287294864654541, + "kl": 0.41162109375, + "learning_rate": 3.4061886249101594e-07, + "loss": 0.0169, + "reward": 1.225446492433548, + "reward_std": 0.20212632790207863, + "rewards/accuracy_reward": 0.23883929662406445, + "rewards/format_reward": 0.9866071790456772, + "step": 2305 + }, + { + "completion_length": 751.1763610839844, + "epoch": 0.688820849824509, + "grad_norm": 1.261157751083374, + "kl": 0.46826171875, + "learning_rate": 3.4020351762385527e-07, + "loss": 0.011, + "reward": 1.2321429252624512, + "reward_std": 0.17762434668838978, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.9821428954601288, + "step": 2306 + }, + { + "completion_length": 921.0803833007812, + "epoch": 0.6891195579120305, + "grad_norm": 1.9408632516860962, + "kl": 0.5556640625, + "learning_rate": 3.397884009947049e-07, + "loss": 0.0392, + "reward": 1.1361607760190964, + "reward_std": 0.19303284585475922, + "rewards/accuracy_reward": 0.16071429220028222, + "rewards/format_reward": 0.9754464626312256, + "step": 2307 + }, + { + "completion_length": 813.0625305175781, + "epoch": 0.6894182659995519, + "grad_norm": 1.3329918384552002, + "kl": 0.370849609375, + "learning_rate": 3.3937351305517137e-07, + "loss": -0.0013, + "reward": 1.1741071939468384, + "reward_std": 0.2061011316254735, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9843750447034836, + "step": 2308 + }, + { + "completion_length": 865.3259429931641, + "epoch": 0.6897169740870734, + "grad_norm": 1.126358151435852, + "kl": 0.5126953125, + "learning_rate": 3.3895885425661206e-07, + "loss": 0.0033, + "reward": 1.1808036267757416, + "reward_std": 0.27530017495155334, + "rewards/accuracy_reward": 0.20982143469154835, + "rewards/format_reward": 0.9709821790456772, + "step": 2309 + }, + { + "completion_length": 827.6451110839844, + "epoch": 0.6900156821745949, + "grad_norm": 1.293604850769043, + "kl": 0.49462890625, + "learning_rate": 3.38544425050135e-07, + "loss": 0.009, + "reward": 1.1450893431901932, + "reward_std": 0.21894368529319763, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.9709821939468384, + "step": 2310 + }, + { + "completion_length": 782.6495971679688, + "epoch": 0.6903143902621164, + "grad_norm": 0.9811223745346069, + "kl": 0.38525390625, + "learning_rate": 3.3813022588659864e-07, + "loss": -0.0042, + "reward": 1.1517857909202576, + "reward_std": 0.18674732092767954, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.9754464477300644, + "step": 2311 + }, + { + "completion_length": 862.0312805175781, + "epoch": 0.6906130983496378, + "grad_norm": 1.8549565076828003, + "kl": 0.5048828125, + "learning_rate": 3.3771625721661116e-07, + "loss": 0.0136, + "reward": 1.1361607611179352, + "reward_std": 0.220906812697649, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9754464626312256, + "step": 2312 + }, + { + "completion_length": 870.0937805175781, + "epoch": 0.6909118064371593, + "grad_norm": 1.252545714378357, + "kl": 0.3828125, + "learning_rate": 3.3730251949052966e-07, + "loss": 0.0163, + "reward": 1.1919643580913544, + "reward_std": 0.21463205479085445, + "rewards/accuracy_reward": 0.21651786379516125, + "rewards/format_reward": 0.9754464775323868, + "step": 2313 + }, + { + "completion_length": 818.6205596923828, + "epoch": 0.6912105145246807, + "grad_norm": 2.0914437770843506, + "kl": 0.439208984375, + "learning_rate": 3.3688901315846045e-07, + "loss": 0.0176, + "reward": 1.1093750596046448, + "reward_std": 0.18298102170228958, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.988839328289032, + "step": 2314 + }, + { + "completion_length": 815.3326263427734, + "epoch": 0.6915092226122023, + "grad_norm": 1.8742985725402832, + "kl": 0.6826171875, + "learning_rate": 3.364757386702577e-07, + "loss": 0.0141, + "reward": 1.1919643580913544, + "reward_std": 0.19001947715878487, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9732143133878708, + "step": 2315 + }, + { + "completion_length": 847.8192138671875, + "epoch": 0.6918079306997237, + "grad_norm": 1.6587930917739868, + "kl": 0.6416015625, + "learning_rate": 3.3606269647552365e-07, + "loss": 0.0563, + "reward": 1.145089328289032, + "reward_std": 0.26480991765856743, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9575893133878708, + "step": 2316 + }, + { + "completion_length": 888.4487152099609, + "epoch": 0.6921066387872452, + "grad_norm": 2.0171287059783936, + "kl": 0.572265625, + "learning_rate": 3.3564988702360785e-07, + "loss": 0.0284, + "reward": 1.1272321939468384, + "reward_std": 0.2949130944907665, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9575893133878708, + "step": 2317 + }, + { + "completion_length": 909.6652221679688, + "epoch": 0.6924053468747666, + "grad_norm": 0.9749118685722351, + "kl": 0.58251953125, + "learning_rate": 3.352373107636063e-07, + "loss": 0.0089, + "reward": 1.1562500596046448, + "reward_std": 0.21899260953068733, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9754464626312256, + "step": 2318 + }, + { + "completion_length": 944.3460235595703, + "epoch": 0.6927040549622882, + "grad_norm": 1.224355697631836, + "kl": 0.521484375, + "learning_rate": 3.3482496814436157e-07, + "loss": 0.0256, + "reward": 1.1361607611179352, + "reward_std": 0.184027761220932, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9843750447034836, + "step": 2319 + }, + { + "completion_length": 867.0089416503906, + "epoch": 0.6930027630498096, + "grad_norm": 1.5004258155822754, + "kl": 0.55419921875, + "learning_rate": 3.344128596144623e-07, + "loss": 0.0403, + "reward": 1.1406250596046448, + "reward_std": 0.22727175429463387, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9732143431901932, + "step": 2320 + }, + { + "completion_length": 892.2902221679688, + "epoch": 0.6933014711373311, + "grad_norm": 1.1081782579421997, + "kl": 0.48046875, + "learning_rate": 3.340009856222417e-07, + "loss": 0.017, + "reward": 1.1160714477300644, + "reward_std": 0.24597933888435364, + "rewards/accuracy_reward": 0.147321441443637, + "rewards/format_reward": 0.9687500447034836, + "step": 2321 + }, + { + "completion_length": 808.9286041259766, + "epoch": 0.6936001792248525, + "grad_norm": 0.6905144453048706, + "kl": 0.51171875, + "learning_rate": 3.3358934661577863e-07, + "loss": 0.0162, + "reward": 1.1116071939468384, + "reward_std": 0.15566634014248848, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9888392984867096, + "step": 2322 + }, + { + "completion_length": 826.5692443847656, + "epoch": 0.693898887312374, + "grad_norm": 1.4040420055389404, + "kl": 0.623046875, + "learning_rate": 3.331779430428961e-07, + "loss": 0.0338, + "reward": 1.2834822237491608, + "reward_std": 0.21574904210865498, + "rewards/accuracy_reward": 0.3258928693830967, + "rewards/format_reward": 0.957589328289032, + "step": 2323 + }, + { + "completion_length": 822.779052734375, + "epoch": 0.6941975953998955, + "grad_norm": 0.6687338948249817, + "kl": 0.529296875, + "learning_rate": 3.3276677535116047e-07, + "loss": 0.0143, + "reward": 1.113839328289032, + "reward_std": 0.171676361002028, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.979910746216774, + "step": 2324 + }, + { + "completion_length": 853.8839569091797, + "epoch": 0.694496303487417, + "grad_norm": 1.0485241413116455, + "kl": 0.4072265625, + "learning_rate": 3.323558439878822e-07, + "loss": 0.038, + "reward": 1.1227679550647736, + "reward_std": 0.1538854781538248, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9799107611179352, + "step": 2325 + }, + { + "completion_length": 766.8348541259766, + "epoch": 0.6947950115749384, + "grad_norm": 1.8852847814559937, + "kl": 0.7822265625, + "learning_rate": 3.3194514940011437e-07, + "loss": 0.001, + "reward": 1.1584821939468384, + "reward_std": 0.23020849749445915, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9709821790456772, + "step": 2326 + }, + { + "completion_length": 886.2946624755859, + "epoch": 0.6950937196624598, + "grad_norm": 1.4585844278335571, + "kl": 0.636962890625, + "learning_rate": 3.315346920346521e-07, + "loss": 0.0028, + "reward": 1.1830357611179352, + "reward_std": 0.25914542749524117, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.973214328289032, + "step": 2327 + }, + { + "completion_length": 935.1161193847656, + "epoch": 0.6953924277499813, + "grad_norm": 1.6094329357147217, + "kl": 0.7314453125, + "learning_rate": 3.311244723380332e-07, + "loss": 0.0192, + "reward": 1.1004464626312256, + "reward_std": 0.18182696402072906, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9687500298023224, + "step": 2328 + }, + { + "completion_length": 880.935302734375, + "epoch": 0.6956911358375027, + "grad_norm": 1.8168244361877441, + "kl": 0.536376953125, + "learning_rate": 3.3071449075653617e-07, + "loss": 0.0327, + "reward": 1.1651785969734192, + "reward_std": 0.2156098671257496, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9709821790456772, + "step": 2329 + }, + { + "completion_length": 800.8616485595703, + "epoch": 0.6959898439250243, + "grad_norm": 1.1165692806243896, + "kl": 0.6083984375, + "learning_rate": 3.303047477361809e-07, + "loss": 0.0189, + "reward": 1.1830357909202576, + "reward_std": 0.16314507368952036, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.979910746216774, + "step": 2330 + }, + { + "completion_length": 910.1764068603516, + "epoch": 0.6962885520125457, + "grad_norm": 0.9804737567901611, + "kl": 0.44189453125, + "learning_rate": 3.298952437227278e-07, + "loss": 0.0297, + "reward": 1.176339328289032, + "reward_std": 0.26433293521404266, + "rewards/accuracy_reward": 0.20535715110599995, + "rewards/format_reward": 0.9709821790456772, + "step": 2331 + }, + { + "completion_length": 908.1384429931641, + "epoch": 0.6965872601000672, + "grad_norm": 1.5906294584274292, + "kl": 0.71728515625, + "learning_rate": 3.2948597916167677e-07, + "loss": 0.0615, + "reward": 1.2187500298023224, + "reward_std": 0.262005515396595, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9687500298023224, + "step": 2332 + }, + { + "completion_length": 932.0000457763672, + "epoch": 0.6968859681875886, + "grad_norm": 1.6273022890090942, + "kl": 0.9482421875, + "learning_rate": 3.2907695449826766e-07, + "loss": 0.0749, + "reward": 1.1361607313156128, + "reward_std": 0.2422085925936699, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9598214775323868, + "step": 2333 + }, + { + "completion_length": 831.7924652099609, + "epoch": 0.6971846762751102, + "grad_norm": 2.4139652252197266, + "kl": 0.71533203125, + "learning_rate": 3.2866817017747947e-07, + "loss": 0.0496, + "reward": 1.1517857909202576, + "reward_std": 0.2113184705376625, + "rewards/accuracy_reward": 0.17187501303851604, + "rewards/format_reward": 0.9799107611179352, + "step": 2334 + }, + { + "completion_length": 899.7679138183594, + "epoch": 0.6974833843626316, + "grad_norm": 2.007594585418701, + "kl": 0.4443359375, + "learning_rate": 3.2825962664402914e-07, + "loss": 0.0232, + "reward": 1.116071492433548, + "reward_std": 0.20958175137639046, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.9620536267757416, + "step": 2335 + }, + { + "completion_length": 855.0870971679688, + "epoch": 0.6977820924501531, + "grad_norm": 1.185710072517395, + "kl": 0.4228515625, + "learning_rate": 3.2785132434237215e-07, + "loss": 0.0067, + "reward": 1.1964285969734192, + "reward_std": 0.19628487527370453, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9843750298023224, + "step": 2336 + }, + { + "completion_length": 854.8170013427734, + "epoch": 0.6980808005376745, + "grad_norm": 4.816127300262451, + "kl": 0.55517578125, + "learning_rate": 3.2744326371670153e-07, + "loss": 0.0619, + "reward": 1.0825893431901932, + "reward_std": 0.18661053478717804, + "rewards/accuracy_reward": 0.10714286169968545, + "rewards/format_reward": 0.9754464775323868, + "step": 2337 + }, + { + "completion_length": 868.4263610839844, + "epoch": 0.698379508625196, + "grad_norm": 0.935210108757019, + "kl": 0.51025390625, + "learning_rate": 3.270354452109468e-07, + "loss": 0.0052, + "reward": 1.1272321939468384, + "reward_std": 0.18389732390642166, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.979910746216774, + "step": 2338 + }, + { + "completion_length": 983.7946929931641, + "epoch": 0.6986782167127175, + "grad_norm": 1.5925599336624146, + "kl": 0.4091796875, + "learning_rate": 3.2662786926877494e-07, + "loss": 0.0278, + "reward": 1.0915178954601288, + "reward_std": 0.19228172302246094, + "rewards/accuracy_reward": 0.12053572246804833, + "rewards/format_reward": 0.9709821790456772, + "step": 2339 + }, + { + "completion_length": 789.0870971679688, + "epoch": 0.698976924800239, + "grad_norm": 2.2822134494781494, + "kl": 0.47265625, + "learning_rate": 3.262205363335885e-07, + "loss": 0.0253, + "reward": 1.1183035969734192, + "reward_std": 0.24180616810917854, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9709821790456772, + "step": 2340 + }, + { + "completion_length": 836.7812805175781, + "epoch": 0.6992756328877604, + "grad_norm": 0.8633013367652893, + "kl": 0.44287109375, + "learning_rate": 3.258134468485258e-07, + "loss": 0.0168, + "reward": 1.176339328289032, + "reward_std": 0.12508991546928883, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9888393133878708, + "step": 2341 + }, + { + "completion_length": 837.4821929931641, + "epoch": 0.6995743409752819, + "grad_norm": 1.1277320384979248, + "kl": 0.5546875, + "learning_rate": 3.2540660125646035e-07, + "loss": 0.0266, + "reward": 1.0625000298023224, + "reward_std": 0.18268887884914875, + "rewards/accuracy_reward": 0.08928571850992739, + "rewards/format_reward": 0.973214328289032, + "step": 2342 + }, + { + "completion_length": 844.2187805175781, + "epoch": 0.6998730490628033, + "grad_norm": 1.4829612970352173, + "kl": 0.59716796875, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0269, + "reward": 1.254464328289032, + "reward_std": 0.205971360206604, + "rewards/accuracy_reward": 0.2834821566939354, + "rewards/format_reward": 0.9709821790456772, + "step": 2343 + }, + { + "completion_length": 867.8683471679688, + "epoch": 0.7001717571503249, + "grad_norm": 1.6924748420715332, + "kl": 0.560546875, + "learning_rate": 3.2459364352148743e-07, + "loss": 0.0329, + "reward": 1.1227678954601288, + "reward_std": 0.2271850649267435, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9642857611179352, + "step": 2344 + }, + { + "completion_length": 903.2567443847656, + "epoch": 0.7004704652378463, + "grad_norm": 1.3884226083755493, + "kl": 0.4462890625, + "learning_rate": 3.2418753226299853e-07, + "loss": 0.0112, + "reward": 1.176339328289032, + "reward_std": 0.25347778759896755, + "rewards/accuracy_reward": 0.20535715483129025, + "rewards/format_reward": 0.9709821790456772, + "step": 2345 + }, + { + "completion_length": 927.6049652099609, + "epoch": 0.7007691733253678, + "grad_norm": 1.6703687906265259, + "kl": 0.51123046875, + "learning_rate": 3.2378166666634257e-07, + "loss": 0.0426, + "reward": 1.09151791036129, + "reward_std": 0.1813963744789362, + "rewards/accuracy_reward": 0.12276786169968545, + "rewards/format_reward": 0.9687500298023224, + "step": 2346 + }, + { + "completion_length": 938.1183471679688, + "epoch": 0.7010678814128892, + "grad_norm": 1.2815426588058472, + "kl": 0.52880859375, + "learning_rate": 3.233760471730613e-07, + "loss": 0.0216, + "reward": 1.1049107313156128, + "reward_std": 0.21555739641189575, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.979910746216774, + "step": 2347 + }, + { + "completion_length": 911.6116485595703, + "epoch": 0.7013665895004108, + "grad_norm": 1.4933050870895386, + "kl": 0.457275390625, + "learning_rate": 3.2297067422442937e-07, + "loss": 0.0494, + "reward": 1.1049107611179352, + "reward_std": 0.14504992961883545, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.9866071790456772, + "step": 2348 + }, + { + "completion_length": 795.1094055175781, + "epoch": 0.7016652975879322, + "grad_norm": 0.8199655413627625, + "kl": 0.46630859375, + "learning_rate": 3.2256554826145255e-07, + "loss": 0.0237, + "reward": 1.1741071939468384, + "reward_std": 0.24326810240745544, + "rewards/accuracy_reward": 0.20089286752045155, + "rewards/format_reward": 0.9732143431901932, + "step": 2349 + }, + { + "completion_length": 886.9487152099609, + "epoch": 0.7019640056754537, + "grad_norm": 1.2605684995651245, + "kl": 0.57568359375, + "learning_rate": 3.221606697248681e-07, + "loss": 0.0301, + "reward": 1.1071429252624512, + "reward_std": 0.2011292278766632, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9754464775323868, + "step": 2350 + }, + { + "completion_length": 854.9062957763672, + "epoch": 0.7022627137629751, + "grad_norm": 1.0345968008041382, + "kl": 0.37353515625, + "learning_rate": 3.2175603905514457e-07, + "loss": 0.0172, + "reward": 1.0915179252624512, + "reward_std": 0.17091221548616886, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.9866071939468384, + "step": 2351 + }, + { + "completion_length": 763.4576263427734, + "epoch": 0.7025614218504966, + "grad_norm": 2.4299893379211426, + "kl": 0.494140625, + "learning_rate": 3.213516566924801e-07, + "loss": -0.0012, + "reward": 1.303571492433548, + "reward_std": 0.2709711976349354, + "rewards/accuracy_reward": 0.3325893059372902, + "rewards/format_reward": 0.9709821790456772, + "step": 2352 + }, + { + "completion_length": 780.8437805175781, + "epoch": 0.702860129938018, + "grad_norm": 1.4309865236282349, + "kl": 0.59521484375, + "learning_rate": 3.209475230768034e-07, + "loss": 0.0384, + "reward": 1.2366071939468384, + "reward_std": 0.1946774758398533, + "rewards/accuracy_reward": 0.2589285895228386, + "rewards/format_reward": 0.9776786118745804, + "step": 2353 + }, + { + "completion_length": 907.9062957763672, + "epoch": 0.7031588380255396, + "grad_norm": 1.429340124130249, + "kl": 0.755859375, + "learning_rate": 3.205436386477718e-07, + "loss": 0.035, + "reward": 1.1093750596046448, + "reward_std": 0.22846924513578415, + "rewards/accuracy_reward": 0.14508929662406445, + "rewards/format_reward": 0.9642857611179352, + "step": 2354 + }, + { + "completion_length": 772.6875457763672, + "epoch": 0.703457546113061, + "grad_norm": 2.113844394683838, + "kl": 0.4521484375, + "learning_rate": 3.2014000384477223e-07, + "loss": 0.0219, + "reward": 1.1584821939468384, + "reward_std": 0.1813322938978672, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9687500447034836, + "step": 2355 + }, + { + "completion_length": 880.0714721679688, + "epoch": 0.7037562542005825, + "grad_norm": 1.185403823852539, + "kl": 0.52392578125, + "learning_rate": 3.197366191069199e-07, + "loss": 0.0131, + "reward": 1.0625000596046448, + "reward_std": 0.2306777983903885, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.964285746216774, + "step": 2356 + }, + { + "completion_length": 829.5067291259766, + "epoch": 0.7040549622881039, + "grad_norm": 1.166911005973816, + "kl": 0.5341796875, + "learning_rate": 3.193334848730577e-07, + "loss": 0.0321, + "reward": 1.1674107611179352, + "reward_std": 0.16565961111336946, + "rewards/accuracy_reward": 0.18080358067527413, + "rewards/format_reward": 0.9866071790456772, + "step": 2357 + }, + { + "completion_length": 859.544677734375, + "epoch": 0.7043536703756255, + "grad_norm": 1.710505485534668, + "kl": 0.68994140625, + "learning_rate": 3.1893060158175607e-07, + "loss": 0.0586, + "reward": 1.1674107313156128, + "reward_std": 0.1997287981212139, + "rewards/accuracy_reward": 0.200892873108387, + "rewards/format_reward": 0.9665178954601288, + "step": 2358 + }, + { + "completion_length": 884.6897583007812, + "epoch": 0.7046523784631469, + "grad_norm": 2.4911484718322754, + "kl": 0.6416015625, + "learning_rate": 3.185279696713129e-07, + "loss": 0.0743, + "reward": 1.176339328289032, + "reward_std": 0.23938824236392975, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.96651791036129, + "step": 2359 + }, + { + "completion_length": 813.2678985595703, + "epoch": 0.7049510865506684, + "grad_norm": 1.4855858087539673, + "kl": 0.623779296875, + "learning_rate": 3.181255895797519e-07, + "loss": 0.0483, + "reward": 1.2321429252624512, + "reward_std": 0.1944783329963684, + "rewards/accuracy_reward": 0.25669644586741924, + "rewards/format_reward": 0.975446492433548, + "step": 2360 + }, + { + "completion_length": 890.6607513427734, + "epoch": 0.7052497946381898, + "grad_norm": 1.0102028846740723, + "kl": 0.45849609375, + "learning_rate": 3.1772346174482325e-07, + "loss": 0.0294, + "reward": 1.0513393431901932, + "reward_std": 0.19586292281746864, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.9709821939468384, + "step": 2361 + }, + { + "completion_length": 886.3326416015625, + "epoch": 0.7055485027257113, + "grad_norm": 1.1362072229385376, + "kl": 0.6474609375, + "learning_rate": 3.1732158660400286e-07, + "loss": 0.0141, + "reward": 1.1383928954601288, + "reward_std": 0.16540240123867989, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.979910746216774, + "step": 2362 + }, + { + "completion_length": 797.5357513427734, + "epoch": 0.7058472108132328, + "grad_norm": 2.0281105041503906, + "kl": 0.64404296875, + "learning_rate": 3.169199645944912e-07, + "loss": 0.0524, + "reward": 1.1830357909202576, + "reward_std": 0.23730846121907234, + "rewards/accuracy_reward": 0.20982143841683865, + "rewards/format_reward": 0.973214328289032, + "step": 2363 + }, + { + "completion_length": 917.9107666015625, + "epoch": 0.7061459189007543, + "grad_norm": 2.3476288318634033, + "kl": 0.833984375, + "learning_rate": 3.1651859615321367e-07, + "loss": 0.0343, + "reward": 1.0625000596046448, + "reward_std": 0.25160402059555054, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.948660746216774, + "step": 2364 + }, + { + "completion_length": 851.1451263427734, + "epoch": 0.7064446269882757, + "grad_norm": 1.3923108577728271, + "kl": 0.626953125, + "learning_rate": 3.161174817168202e-07, + "loss": -0.0253, + "reward": 1.100446492433548, + "reward_std": 0.24124879017472267, + "rewards/accuracy_reward": 0.1406250111758709, + "rewards/format_reward": 0.9598214626312256, + "step": 2365 + }, + { + "completion_length": 841.3348541259766, + "epoch": 0.7067433350757972, + "grad_norm": 1.6481051445007324, + "kl": 0.66064453125, + "learning_rate": 3.1571662172168334e-07, + "loss": 0.0292, + "reward": 1.2433036267757416, + "reward_std": 0.23768223449587822, + "rewards/accuracy_reward": 0.2767857313156128, + "rewards/format_reward": 0.9665178954601288, + "step": 2366 + }, + { + "completion_length": 931.9888916015625, + "epoch": 0.7070420431633186, + "grad_norm": 2.2282989025115967, + "kl": 0.7685546875, + "learning_rate": 3.153160166039e-07, + "loss": 0.057, + "reward": 1.1205357611179352, + "reward_std": 0.21006687730550766, + "rewards/accuracy_reward": 0.17187500861473382, + "rewards/format_reward": 0.9486607611179352, + "step": 2367 + }, + { + "completion_length": 861.2768096923828, + "epoch": 0.7073407512508402, + "grad_norm": 0.9817290306091309, + "kl": 0.73583984375, + "learning_rate": 3.1491566679928895e-07, + "loss": 0.024, + "reward": 1.1294643580913544, + "reward_std": 0.1705084778368473, + "rewards/accuracy_reward": 0.14955358020961285, + "rewards/format_reward": 0.979910746216774, + "step": 2368 + }, + { + "completion_length": 759.7611846923828, + "epoch": 0.7076394593383616, + "grad_norm": 1.5159913301467896, + "kl": 0.931640625, + "learning_rate": 3.145155727433917e-07, + "loss": 0.0539, + "reward": 1.1540179252624512, + "reward_std": 0.24741947650909424, + "rewards/accuracy_reward": 0.1919643022119999, + "rewards/format_reward": 0.9620536267757416, + "step": 2369 + }, + { + "completion_length": 980.4442596435547, + "epoch": 0.707938167425883, + "grad_norm": 1.046412706375122, + "kl": 0.7548828125, + "learning_rate": 3.141157348714716e-07, + "loss": 0.0675, + "reward": 1.1183036267757416, + "reward_std": 0.2275369130074978, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.9687500298023224, + "step": 2370 + }, + { + "completion_length": 763.6964416503906, + "epoch": 0.7082368755134045, + "grad_norm": 0.6616463661193848, + "kl": 0.40478515625, + "learning_rate": 3.1371615361851246e-07, + "loss": 0.033, + "reward": 1.225446492433548, + "reward_std": 0.23717477917671204, + "rewards/accuracy_reward": 0.2388392984867096, + "rewards/format_reward": 0.9866071790456772, + "step": 2371 + }, + { + "completion_length": 847.6674346923828, + "epoch": 0.7085355836009259, + "grad_norm": 1.2719382047653198, + "kl": 0.8623046875, + "learning_rate": 3.1331682941922e-07, + "loss": 0.0333, + "reward": 1.1696429252624512, + "reward_std": 0.24265654385089874, + "rewards/accuracy_reward": 0.20089286379516125, + "rewards/format_reward": 0.9687500447034836, + "step": 2372 + }, + { + "completion_length": 921.2500610351562, + "epoch": 0.7088342916884475, + "grad_norm": 1.0284950733184814, + "kl": 0.70361328125, + "learning_rate": 3.129177627080198e-07, + "loss": 0.0086, + "reward": 1.0758929252624512, + "reward_std": 0.20364383608102798, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 2373 + }, + { + "completion_length": 924.716552734375, + "epoch": 0.7091329997759689, + "grad_norm": 1.2232908010482788, + "kl": 0.841796875, + "learning_rate": 3.125189539190571e-07, + "loss": 0.071, + "reward": 1.1919643580913544, + "reward_std": 0.2880982533097267, + "rewards/accuracy_reward": 0.2299107275903225, + "rewards/format_reward": 0.9620535969734192, + "step": 2374 + }, + { + "completion_length": 722.7433319091797, + "epoch": 0.7094317078634904, + "grad_norm": 1.0002504587173462, + "kl": 0.5029296875, + "learning_rate": 3.121204034861969e-07, + "loss": 0.0223, + "reward": 1.2031250298023224, + "reward_std": 0.21896571666002274, + "rewards/accuracy_reward": 0.2343750037252903, + "rewards/format_reward": 0.9687500298023224, + "step": 2375 + }, + { + "completion_length": 837.8482360839844, + "epoch": 0.7097304159510118, + "grad_norm": 2.03952693939209, + "kl": 0.60546875, + "learning_rate": 3.117221118430231e-07, + "loss": 0.0375, + "reward": 1.191964328289032, + "reward_std": 0.23159269988536835, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9687500298023224, + "step": 2376 + }, + { + "completion_length": 880.8504943847656, + "epoch": 0.7100291240385334, + "grad_norm": 1.528032898902893, + "kl": 0.59912109375, + "learning_rate": 3.1132407942283777e-07, + "loss": 0.0332, + "reward": 1.1540179252624512, + "reward_std": 0.17316418886184692, + "rewards/accuracy_reward": 0.17857143771834671, + "rewards/format_reward": 0.9754464626312256, + "step": 2377 + }, + { + "completion_length": 907.0045166015625, + "epoch": 0.7103278321260548, + "grad_norm": 0.9086123108863831, + "kl": 0.470458984375, + "learning_rate": 3.109263066586613e-07, + "loss": 0.0205, + "reward": 1.1785714328289032, + "reward_std": 0.20102538913488388, + "rewards/accuracy_reward": 0.19866072572767735, + "rewards/format_reward": 0.9799107611179352, + "step": 2378 + }, + { + "completion_length": 1003.6518249511719, + "epoch": 0.7106265402135763, + "grad_norm": 1.1878752708435059, + "kl": 0.734375, + "learning_rate": 3.105287939832316e-07, + "loss": 0.0105, + "reward": 1.1138393580913544, + "reward_std": 0.20113129913806915, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.9575893133878708, + "step": 2379 + }, + { + "completion_length": 770.0937805175781, + "epoch": 0.7109252483010977, + "grad_norm": 2.2950499057769775, + "kl": 0.74853515625, + "learning_rate": 3.1013154182900307e-07, + "loss": 0.0724, + "reward": 1.1339285969734192, + "reward_std": 0.20306216180324554, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.973214328289032, + "step": 2380 + }, + { + "completion_length": 893.3214569091797, + "epoch": 0.7112239563886192, + "grad_norm": 1.8172203302383423, + "kl": 0.70947265625, + "learning_rate": 3.0973455062814767e-07, + "loss": 0.0201, + "reward": 1.1071428954601288, + "reward_std": 0.2846030220389366, + "rewards/accuracy_reward": 0.15178571827709675, + "rewards/format_reward": 0.9553571939468384, + "step": 2381 + }, + { + "completion_length": 819.7678985595703, + "epoch": 0.7115226644761407, + "grad_norm": 1.6471861600875854, + "kl": 0.54248046875, + "learning_rate": 3.0933782081255243e-07, + "loss": 0.0138, + "reward": 1.1294642984867096, + "reward_std": 0.1675387267023325, + "rewards/accuracy_reward": 0.14732143585570157, + "rewards/format_reward": 0.9821428805589676, + "step": 2382 + }, + { + "completion_length": 933.0692443847656, + "epoch": 0.7118213725636622, + "grad_norm": 1.0991019010543823, + "kl": 0.70556640625, + "learning_rate": 3.089413528138207e-07, + "loss": 0.036, + "reward": 1.113839328289032, + "reward_std": 0.21383283101022243, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.9776786118745804, + "step": 2383 + }, + { + "completion_length": 885.6339721679688, + "epoch": 0.7121200806511836, + "grad_norm": 1.348259687423706, + "kl": 0.4912109375, + "learning_rate": 3.0854514706327105e-07, + "loss": 0.027, + "reward": 1.0669643580913544, + "reward_std": 0.2655791211873293, + "rewards/accuracy_reward": 0.10491072130389512, + "rewards/format_reward": 0.9620536118745804, + "step": 2384 + }, + { + "completion_length": 862.841552734375, + "epoch": 0.7124187887387051, + "grad_norm": 1.672980785369873, + "kl": 0.7646484375, + "learning_rate": 3.081492039919361e-07, + "loss": 0.0223, + "reward": 1.1205357611179352, + "reward_std": 0.21657218039035797, + "rewards/accuracy_reward": 0.15401786286383867, + "rewards/format_reward": 0.9665178954601288, + "step": 2385 + }, + { + "completion_length": 864.8884429931641, + "epoch": 0.7127174968262265, + "grad_norm": 1.8617854118347168, + "kl": 0.998046875, + "learning_rate": 3.077535240305632e-07, + "loss": 0.0614, + "reward": 1.1696428954601288, + "reward_std": 0.27071403712034225, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.9553571790456772, + "step": 2386 + }, + { + "completion_length": 900.0558471679688, + "epoch": 0.7130162049137481, + "grad_norm": 1.9417465925216675, + "kl": 1.22265625, + "learning_rate": 3.0735810760961367e-07, + "loss": 0.1091, + "reward": 1.0558035969734192, + "reward_std": 0.1927819326519966, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9665178805589676, + "step": 2387 + }, + { + "completion_length": 827.0915679931641, + "epoch": 0.7133149130012695, + "grad_norm": 1.6374276876449585, + "kl": 0.77734375, + "learning_rate": 3.069629551592615e-07, + "loss": 0.0493, + "reward": 1.254464328289032, + "reward_std": 0.22836207319051027, + "rewards/accuracy_reward": 0.294642873108387, + "rewards/format_reward": 0.9598214626312256, + "step": 2388 + }, + { + "completion_length": 842.7120971679688, + "epoch": 0.713613621088791, + "grad_norm": 2.851123571395874, + "kl": 0.65771484375, + "learning_rate": 3.065680671093939e-07, + "loss": 0.0594, + "reward": 1.1741071939468384, + "reward_std": 0.20055343583226204, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9665178954601288, + "step": 2389 + }, + { + "completion_length": 910.1674652099609, + "epoch": 0.7139123291763124, + "grad_norm": 1.2908076047897339, + "kl": 0.8017578125, + "learning_rate": 3.061734438896104e-07, + "loss": 0.0829, + "reward": 1.0825893580913544, + "reward_std": 0.1424635499715805, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.9776786118745804, + "step": 2390 + }, + { + "completion_length": 882.9799652099609, + "epoch": 0.714211037263834, + "grad_norm": 1.9015859365463257, + "kl": 0.611328125, + "learning_rate": 3.0577908592922235e-07, + "loss": 0.0121, + "reward": 1.223214328289032, + "reward_std": 0.3013296462595463, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.9508928954601288, + "step": 2391 + }, + { + "completion_length": 841.1205749511719, + "epoch": 0.7145097453513554, + "grad_norm": 4.1551127433776855, + "kl": 0.9814453125, + "learning_rate": 3.053849936572526e-07, + "loss": 0.0477, + "reward": 1.1540178954601288, + "reward_std": 0.2603931650519371, + "rewards/accuracy_reward": 0.19866072246804833, + "rewards/format_reward": 0.9553571790456772, + "step": 2392 + }, + { + "completion_length": 880.3326263427734, + "epoch": 0.7148084534388769, + "grad_norm": 1.344546914100647, + "kl": 0.58349609375, + "learning_rate": 3.0499116750243504e-07, + "loss": 0.0345, + "reward": 1.1718750298023224, + "reward_std": 0.22436023876070976, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.964285746216774, + "step": 2393 + }, + { + "completion_length": 896.5491333007812, + "epoch": 0.7151071615263983, + "grad_norm": 2.5681278705596924, + "kl": 1.14892578125, + "learning_rate": 3.0459760789321357e-07, + "loss": 0.041, + "reward": 1.0736607760190964, + "reward_std": 0.25281750597059727, + "rewards/accuracy_reward": 0.12500001047737896, + "rewards/format_reward": 0.9486607611179352, + "step": 2394 + }, + { + "completion_length": 855.1094055175781, + "epoch": 0.7154058696139198, + "grad_norm": 1.5873347520828247, + "kl": 0.7353515625, + "learning_rate": 3.0420431525774293e-07, + "loss": 0.0229, + "reward": 1.1183035969734192, + "reward_std": 0.20516245067119598, + "rewards/accuracy_reward": 0.1450892984867096, + "rewards/format_reward": 0.9732143431901932, + "step": 2395 + }, + { + "completion_length": 816.5558319091797, + "epoch": 0.7157045777014412, + "grad_norm": 6.446451663970947, + "kl": 1.07421875, + "learning_rate": 3.0381129002388653e-07, + "loss": 0.0608, + "reward": 1.2410715222358704, + "reward_std": 0.26648156344890594, + "rewards/accuracy_reward": 0.2700892947614193, + "rewards/format_reward": 0.9709821790456772, + "step": 2396 + }, + { + "completion_length": 747.3906555175781, + "epoch": 0.7160032857889628, + "grad_norm": 1.0153093338012695, + "kl": 0.8173828125, + "learning_rate": 3.0341853261921753e-07, + "loss": 0.0546, + "reward": 1.160714328289032, + "reward_std": 0.22968080081045628, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9598214626312256, + "step": 2397 + }, + { + "completion_length": 799.2567443847656, + "epoch": 0.7163019938764842, + "grad_norm": 1.1313637495040894, + "kl": 0.78125, + "learning_rate": 3.0302604347101763e-07, + "loss": 0.0725, + "reward": 1.2165179252624512, + "reward_std": 0.23680735751986504, + "rewards/accuracy_reward": 0.2410714440047741, + "rewards/format_reward": 0.9754464626312256, + "step": 2398 + }, + { + "completion_length": 823.9219055175781, + "epoch": 0.7166007019640057, + "grad_norm": 2.839247703552246, + "kl": 0.806640625, + "learning_rate": 3.0263382300627615e-07, + "loss": 0.0354, + "reward": 1.0781250596046448, + "reward_std": 0.18756618350744247, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.9575893431901932, + "step": 2399 + }, + { + "completion_length": 858.6763763427734, + "epoch": 0.7168994100515271, + "grad_norm": 1.7947309017181396, + "kl": 1.1953125, + "learning_rate": 3.022418716516908e-07, + "loss": 0.0866, + "reward": 1.2544643580913544, + "reward_std": 0.27101368457078934, + "rewards/accuracy_reward": 0.30357144214212894, + "rewards/format_reward": 0.9508928954601288, + "step": 2400 + }, + { + "completion_length": 967.4866333007812, + "epoch": 0.7171981181390487, + "grad_norm": 2.289304494857788, + "kl": 0.927734375, + "learning_rate": 3.018501898336664e-07, + "loss": 0.0205, + "reward": 1.176339328289032, + "reward_std": 0.2174098826944828, + "rewards/accuracy_reward": 0.2209821492433548, + "rewards/format_reward": 0.9553571790456772, + "step": 2401 + }, + { + "completion_length": 822.7969207763672, + "epoch": 0.7174968262265701, + "grad_norm": 1.2783464193344116, + "kl": 0.904296875, + "learning_rate": 3.0145877797831385e-07, + "loss": 0.0283, + "reward": 1.0870535969734192, + "reward_std": 0.2423391416668892, + "rewards/accuracy_reward": 0.12053572293370962, + "rewards/format_reward": 0.9665178954601288, + "step": 2402 + }, + { + "completion_length": 806.3839569091797, + "epoch": 0.7177955343140916, + "grad_norm": 1.2022521495819092, + "kl": 1.3291015625, + "learning_rate": 3.0106763651145134e-07, + "loss": 0.0635, + "reward": 1.142857164144516, + "reward_std": 0.21527925319969654, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.9598214775323868, + "step": 2403 + }, + { + "completion_length": 870.5759429931641, + "epoch": 0.718094242401613, + "grad_norm": 1.3229548931121826, + "kl": 0.9697265625, + "learning_rate": 3.006767658586024e-07, + "loss": 0.0156, + "reward": 1.0915178954601288, + "reward_std": 0.2396494708955288, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.9620536118745804, + "step": 2404 + }, + { + "completion_length": 895.6986999511719, + "epoch": 0.7183929504891345, + "grad_norm": 2.3551647663116455, + "kl": 1.197265625, + "learning_rate": 3.002861664449957e-07, + "loss": 0.0771, + "reward": 1.0959821939468384, + "reward_std": 0.24099516868591309, + "rewards/accuracy_reward": 0.14062500861473382, + "rewards/format_reward": 0.9553571790456772, + "step": 2405 + }, + { + "completion_length": 875.1317291259766, + "epoch": 0.718691658576656, + "grad_norm": 2.581920623779297, + "kl": 0.85400390625, + "learning_rate": 2.998958386955654e-07, + "loss": 0.0533, + "reward": 1.1852678954601288, + "reward_std": 0.13202796503901482, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.9754464775323868, + "step": 2406 + }, + { + "completion_length": 871.3281555175781, + "epoch": 0.7189903666641775, + "grad_norm": 2.095125675201416, + "kl": 0.712890625, + "learning_rate": 2.9950578303494976e-07, + "loss": 0.0334, + "reward": 1.0089286267757416, + "reward_std": 0.2338748835027218, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.9419643431901932, + "step": 2407 + }, + { + "completion_length": 850.4397735595703, + "epoch": 0.7192890747516989, + "grad_norm": 2.8409101963043213, + "kl": 0.52783203125, + "learning_rate": 2.9911599988749114e-07, + "loss": 0.0447, + "reward": 1.1316964775323868, + "reward_std": 0.21916008554399014, + "rewards/accuracy_reward": 0.16071429336443543, + "rewards/format_reward": 0.9709821939468384, + "step": 2408 + }, + { + "completion_length": 910.7478179931641, + "epoch": 0.7195877828392204, + "grad_norm": 0.9349902868270874, + "kl": 0.6728515625, + "learning_rate": 2.9872648967723545e-07, + "loss": -0.0039, + "reward": 1.0892857760190964, + "reward_std": 0.28698988258838654, + "rewards/accuracy_reward": 0.13392857555299997, + "rewards/format_reward": 0.9553571790456772, + "step": 2409 + }, + { + "completion_length": 847.4442443847656, + "epoch": 0.7198864909267418, + "grad_norm": 1.0676758289337158, + "kl": 0.5673828125, + "learning_rate": 2.9833725282793145e-07, + "loss": -0.004, + "reward": 1.1540179252624512, + "reward_std": 0.19908184930682182, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.9799107611179352, + "step": 2410 + }, + { + "completion_length": 859.2544860839844, + "epoch": 0.7201851990142634, + "grad_norm": 1.2028677463531494, + "kl": 0.98046875, + "learning_rate": 2.979482897630307e-07, + "loss": -0.0042, + "reward": 1.0825893580913544, + "reward_std": 0.2151030506938696, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.957589328289032, + "step": 2411 + }, + { + "completion_length": 911.7902374267578, + "epoch": 0.7204839071017848, + "grad_norm": 1.2009598016738892, + "kl": 0.8564453125, + "learning_rate": 2.975596009056871e-07, + "loss": 0.0069, + "reward": 1.1205357909202576, + "reward_std": 0.19462421722710133, + "rewards/accuracy_reward": 0.15625000791624188, + "rewards/format_reward": 0.9642857611179352, + "step": 2412 + }, + { + "completion_length": 841.9821624755859, + "epoch": 0.7207826151893062, + "grad_norm": 1.2780641317367554, + "kl": 0.57373046875, + "learning_rate": 2.9717118667875584e-07, + "loss": 0.019, + "reward": 1.0267857611179352, + "reward_std": 0.19333799555897713, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.9531250298023224, + "step": 2413 + }, + { + "completion_length": 817.6719055175781, + "epoch": 0.7210813232768277, + "grad_norm": 1.1579357385635376, + "kl": 0.6865234375, + "learning_rate": 2.9678304750479366e-07, + "loss": 0.0133, + "reward": 1.2343750298023224, + "reward_std": 0.24249719083309174, + "rewards/accuracy_reward": 0.2566964477300644, + "rewards/format_reward": 0.9776786267757416, + "step": 2414 + }, + { + "completion_length": 827.4397735595703, + "epoch": 0.7213800313643491, + "grad_norm": 1.858438491821289, + "kl": 0.7490234375, + "learning_rate": 2.9639518380605776e-07, + "loss": -0.0031, + "reward": 1.178571492433548, + "reward_std": 0.21561622992157936, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9776786118745804, + "step": 2415 + }, + { + "completion_length": 872.9576110839844, + "epoch": 0.7216787394518707, + "grad_norm": 1.118154525756836, + "kl": 0.5, + "learning_rate": 2.96007596004506e-07, + "loss": 0.0188, + "reward": 1.1830357611179352, + "reward_std": 0.15080795250833035, + "rewards/accuracy_reward": 0.2098214402794838, + "rewards/format_reward": 0.9732143133878708, + "step": 2416 + }, + { + "completion_length": 893.0178833007812, + "epoch": 0.7219774475393921, + "grad_norm": 1.2127339839935303, + "kl": 0.65771484375, + "learning_rate": 2.956202845217959e-07, + "loss": 0.042, + "reward": 1.0982142984867096, + "reward_std": 0.1819814257323742, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9598214626312256, + "step": 2417 + }, + { + "completion_length": 870.0335235595703, + "epoch": 0.7222761556269136, + "grad_norm": 2.0008606910705566, + "kl": 0.689453125, + "learning_rate": 2.952332497792842e-07, + "loss": 0.0353, + "reward": 1.1183036267757416, + "reward_std": 0.15209118835628033, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9799107611179352, + "step": 2418 + }, + { + "completion_length": 841.9576416015625, + "epoch": 0.722574863714435, + "grad_norm": 0.687412440776825, + "kl": 0.47265625, + "learning_rate": 2.94846492198027e-07, + "loss": 0.0158, + "reward": 1.1941964626312256, + "reward_std": 0.16572850570082664, + "rewards/accuracy_reward": 0.2254464291036129, + "rewards/format_reward": 0.9687500149011612, + "step": 2419 + }, + { + "completion_length": 817.0491333007812, + "epoch": 0.7228735718019565, + "grad_norm": 0.9571564197540283, + "kl": 0.4921875, + "learning_rate": 2.944600121987786e-07, + "loss": 0.0245, + "reward": 1.194196492433548, + "reward_std": 0.23606222309172153, + "rewards/accuracy_reward": 0.22098214784637094, + "rewards/format_reward": 0.9732143133878708, + "step": 2420 + }, + { + "completion_length": 821.0111999511719, + "epoch": 0.723172279889478, + "grad_norm": 1.0263603925704956, + "kl": 0.6904296875, + "learning_rate": 2.9407381020199125e-07, + "loss": 0.0273, + "reward": 1.2433036267757416, + "reward_std": 0.20819829404354095, + "rewards/accuracy_reward": 0.2723214514553547, + "rewards/format_reward": 0.9709821790456772, + "step": 2421 + }, + { + "completion_length": 930.2344055175781, + "epoch": 0.7234709879769995, + "grad_norm": 0.8013232350349426, + "kl": 0.557373046875, + "learning_rate": 2.9368788662781493e-07, + "loss": -0.0148, + "reward": 1.0401786267757416, + "reward_std": 0.24701273813843727, + "rewards/accuracy_reward": 0.08928571990691125, + "rewards/format_reward": 0.9508928954601288, + "step": 2422 + }, + { + "completion_length": 910.6339569091797, + "epoch": 0.7237696960645209, + "grad_norm": 1.021182656288147, + "kl": 0.54248046875, + "learning_rate": 2.9330224189609674e-07, + "loss": 0.0072, + "reward": 1.1004464626312256, + "reward_std": 0.1761330310255289, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9665178954601288, + "step": 2423 + }, + { + "completion_length": 950.8795013427734, + "epoch": 0.7240684041520424, + "grad_norm": 1.747532844543457, + "kl": 0.64697265625, + "learning_rate": 2.929168764263802e-07, + "loss": -0.0009, + "reward": 1.1339286267757416, + "reward_std": 0.21759052947163582, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.973214328289032, + "step": 2424 + }, + { + "completion_length": 887.0625457763672, + "epoch": 0.7243671122395638, + "grad_norm": 2.9649572372436523, + "kl": 0.51611328125, + "learning_rate": 2.9253179063790525e-07, + "loss": 0.0245, + "reward": 1.160714328289032, + "reward_std": 0.2102176770567894, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9709821790456772, + "step": 2425 + }, + { + "completion_length": 885.8214721679688, + "epoch": 0.7246658203270854, + "grad_norm": 1.1417959928512573, + "kl": 0.311767578125, + "learning_rate": 2.921469849496077e-07, + "loss": -0.0076, + "reward": 1.1651786416769028, + "reward_std": 0.17017664946615696, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9843750149011612, + "step": 2426 + }, + { + "completion_length": 788.4018249511719, + "epoch": 0.7249645284146068, + "grad_norm": 1.0590046644210815, + "kl": 0.34912109375, + "learning_rate": 2.917624597801179e-07, + "loss": -0.0023, + "reward": 1.084821492433548, + "reward_std": 0.23352307826280594, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.9665178954601288, + "step": 2427 + }, + { + "completion_length": 844.6027221679688, + "epoch": 0.7252632365021283, + "grad_norm": 1.869219183921814, + "kl": 0.47900390625, + "learning_rate": 2.913782155477622e-07, + "loss": 0.0139, + "reward": 1.1718750298023224, + "reward_std": 0.2162465713918209, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9799107611179352, + "step": 2428 + }, + { + "completion_length": 927.3795166015625, + "epoch": 0.7255619445896497, + "grad_norm": 2.130452871322632, + "kl": 0.61328125, + "learning_rate": 2.909942526705601e-07, + "loss": 0.0364, + "reward": 1.0691964626312256, + "reward_std": 0.20361926034092903, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.9598214477300644, + "step": 2429 + }, + { + "completion_length": 895.0781555175781, + "epoch": 0.7258606526771713, + "grad_norm": 1.4942139387130737, + "kl": 0.4755859375, + "learning_rate": 2.906105715662257e-07, + "loss": 0.0257, + "reward": 1.1875000596046448, + "reward_std": 0.22264358028769493, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.964285746216774, + "step": 2430 + }, + { + "completion_length": 830.6495971679688, + "epoch": 0.7261593607646927, + "grad_norm": 3.091323137283325, + "kl": 0.876220703125, + "learning_rate": 2.902271726521668e-07, + "loss": 0.0096, + "reward": 1.0446428954601288, + "reward_std": 0.2675085701048374, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.9531250298023224, + "step": 2431 + }, + { + "completion_length": 856.0379943847656, + "epoch": 0.7264580688522142, + "grad_norm": 1.0848897695541382, + "kl": 0.64404296875, + "learning_rate": 2.898440563454834e-07, + "loss": 0.0534, + "reward": 1.1562500596046448, + "reward_std": 0.21759512647986412, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9642857760190964, + "step": 2432 + }, + { + "completion_length": 856.0379791259766, + "epoch": 0.7267567769397356, + "grad_norm": 0.9687546491622925, + "kl": 0.5888671875, + "learning_rate": 2.8946122306296874e-07, + "loss": 0.0144, + "reward": 1.1361607611179352, + "reward_std": 0.22358643636107445, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.96651791036129, + "step": 2433 + }, + { + "completion_length": 816.7076263427734, + "epoch": 0.7270554850272571, + "grad_norm": 2.043839931488037, + "kl": 0.560302734375, + "learning_rate": 2.890786732211079e-07, + "loss": 0.0129, + "reward": 1.0959821939468384, + "reward_std": 0.21831445768475533, + "rewards/accuracy_reward": 0.12723214412108064, + "rewards/format_reward": 0.9687500447034836, + "step": 2434 + }, + { + "completion_length": 795.0625457763672, + "epoch": 0.7273541931147786, + "grad_norm": 1.4332971572875977, + "kl": 0.7314453125, + "learning_rate": 2.886964072360775e-07, + "loss": 0.0205, + "reward": 1.1696429252624512, + "reward_std": 0.24602896347641945, + "rewards/accuracy_reward": 0.20089286752045155, + "rewards/format_reward": 0.9687500298023224, + "step": 2435 + }, + { + "completion_length": 945.8237152099609, + "epoch": 0.7276529012023001, + "grad_norm": 5.048965930938721, + "kl": 0.553955078125, + "learning_rate": 2.883144255237454e-07, + "loss": 0.007, + "reward": 1.1339286267757416, + "reward_std": 0.21474039647728205, + "rewards/accuracy_reward": 0.14955357951112092, + "rewards/format_reward": 0.9843750298023224, + "step": 2436 + }, + { + "completion_length": 942.5290679931641, + "epoch": 0.7279516092898215, + "grad_norm": 1.029938817024231, + "kl": 0.52099609375, + "learning_rate": 2.879327284996706e-07, + "loss": -0.0017, + "reward": 1.0848214626312256, + "reward_std": 0.21844754740595818, + "rewards/accuracy_reward": 0.12723215203732252, + "rewards/format_reward": 0.957589328289032, + "step": 2437 + }, + { + "completion_length": 850.7545013427734, + "epoch": 0.728250317377343, + "grad_norm": 1.027382731437683, + "kl": 0.304443359375, + "learning_rate": 2.875513165791017e-07, + "loss": 0.0062, + "reward": 1.2053571939468384, + "reward_std": 0.15858613140881062, + "rewards/accuracy_reward": 0.20982144214212894, + "rewards/format_reward": 0.9955357313156128, + "step": 2438 + }, + { + "completion_length": 820.9152069091797, + "epoch": 0.7285490254648644, + "grad_norm": 0.7997962236404419, + "kl": 0.62548828125, + "learning_rate": 2.8717019017697774e-07, + "loss": 0.0305, + "reward": 1.1584821790456772, + "reward_std": 0.19963065348565578, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9709821790456772, + "step": 2439 + }, + { + "completion_length": 910.7969360351562, + "epoch": 0.728847733552386, + "grad_norm": 1.3000373840332031, + "kl": 0.80859375, + "learning_rate": 2.867893497079267e-07, + "loss": 0.0152, + "reward": 1.20089291036129, + "reward_std": 0.3060663193464279, + "rewards/accuracy_reward": 0.24107143888249993, + "rewards/format_reward": 0.9598214626312256, + "step": 2440 + }, + { + "completion_length": 899.5022735595703, + "epoch": 0.7291464416399074, + "grad_norm": 1.0778597593307495, + "kl": 0.859375, + "learning_rate": 2.864087955862657e-07, + "loss": 0.0322, + "reward": 1.131696492433548, + "reward_std": 0.2076122686266899, + "rewards/accuracy_reward": 0.18080357578583062, + "rewards/format_reward": 0.9508928954601288, + "step": 2441 + }, + { + "completion_length": 925.4464874267578, + "epoch": 0.7294451497274289, + "grad_norm": 1.5854331254959106, + "kl": 0.67822265625, + "learning_rate": 2.8602852822600055e-07, + "loss": 0.0153, + "reward": 1.1049107611179352, + "reward_std": 0.24066566675901413, + "rewards/accuracy_reward": 0.14732143841683865, + "rewards/format_reward": 0.9575893133878708, + "step": 2442 + }, + { + "completion_length": 897.7232513427734, + "epoch": 0.7297438578149503, + "grad_norm": 3.0235373973846436, + "kl": 0.85009765625, + "learning_rate": 2.8564854804082455e-07, + "loss": 0.0022, + "reward": 1.1450893580913544, + "reward_std": 0.26077182963490486, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.957589328289032, + "step": 2443 + }, + { + "completion_length": 825.7924499511719, + "epoch": 0.7300425659024719, + "grad_norm": 1.6063376665115356, + "kl": 0.42578125, + "learning_rate": 2.8526885544411906e-07, + "loss": 0.0453, + "reward": 1.129464328289032, + "reward_std": 0.18926150351762772, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9754464626312256, + "step": 2444 + }, + { + "completion_length": 857.7009429931641, + "epoch": 0.7303412739899933, + "grad_norm": 1.8295258283615112, + "kl": 0.5615234375, + "learning_rate": 2.8488945084895256e-07, + "loss": -0.03, + "reward": 1.0647321939468384, + "reward_std": 0.21306613832712173, + "rewards/accuracy_reward": 0.10267857508733869, + "rewards/format_reward": 0.9620535969734192, + "step": 2445 + }, + { + "completion_length": 916.8683166503906, + "epoch": 0.7306399820775148, + "grad_norm": 2.540970802307129, + "kl": 0.67138671875, + "learning_rate": 2.8451033466807976e-07, + "loss": 0.0913, + "reward": 1.0736607611179352, + "reward_std": 0.19143314845860004, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 2446 + }, + { + "completion_length": 973.8103332519531, + "epoch": 0.7309386901650362, + "grad_norm": 1.4037129878997803, + "kl": 0.57177734375, + "learning_rate": 2.8413150731394207e-07, + "loss": 0.0134, + "reward": 1.0803571939468384, + "reward_std": 0.19911177456378937, + "rewards/accuracy_reward": 0.11830357764847577, + "rewards/format_reward": 0.9620536118745804, + "step": 2447 + }, + { + "completion_length": 715.2522583007812, + "epoch": 0.7312373982525577, + "grad_norm": 1.7229807376861572, + "kl": 0.5009765625, + "learning_rate": 2.8375296919866666e-07, + "loss": -0.0121, + "reward": 1.1049107611179352, + "reward_std": 0.18178985454142094, + "rewards/accuracy_reward": 0.12723215040750802, + "rewards/format_reward": 0.9776785969734192, + "step": 2448 + }, + { + "completion_length": 799.794677734375, + "epoch": 0.7315361063400792, + "grad_norm": 2.382214307785034, + "kl": 0.58935546875, + "learning_rate": 2.8337472073406554e-07, + "loss": 0.0308, + "reward": 1.1674107909202576, + "reward_std": 0.21602165699005127, + "rewards/accuracy_reward": 0.2031250037252903, + "rewards/format_reward": 0.9642857611179352, + "step": 2449 + }, + { + "completion_length": 791.2723541259766, + "epoch": 0.7318348144276007, + "grad_norm": 1.643492579460144, + "kl": 0.59619140625, + "learning_rate": 2.829967623316362e-07, + "loss": 0.0109, + "reward": 1.1026785969734192, + "reward_std": 0.21068266779184341, + "rewards/accuracy_reward": 0.13392857927829027, + "rewards/format_reward": 0.9687500447034836, + "step": 2450 + }, + { + "completion_length": 889.060302734375, + "epoch": 0.7321335225151221, + "grad_norm": 0.6666272282600403, + "kl": 0.272216796875, + "learning_rate": 2.8261909440256053e-07, + "loss": -0.0259, + "reward": 1.1808036267757416, + "reward_std": 0.1782340295612812, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9776786118745804, + "step": 2451 + }, + { + "completion_length": 832.4777374267578, + "epoch": 0.7324322306026436, + "grad_norm": 1.7015496492385864, + "kl": 0.53271484375, + "learning_rate": 2.822417173577038e-07, + "loss": 0.0512, + "reward": 1.131696492433548, + "reward_std": 0.16289633978158236, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.9687500447034836, + "step": 2452 + }, + { + "completion_length": 861.7678985595703, + "epoch": 0.732730938690165, + "grad_norm": 0.7936902642250061, + "kl": 0.685546875, + "learning_rate": 2.818646316076156e-07, + "loss": 0.0113, + "reward": 1.1049107611179352, + "reward_std": 0.20070640370249748, + "rewards/accuracy_reward": 0.15178571874275804, + "rewards/format_reward": 0.9531250596046448, + "step": 2453 + }, + { + "completion_length": 826.6964569091797, + "epoch": 0.7330296467776866, + "grad_norm": 1.1741076707839966, + "kl": 0.48828125, + "learning_rate": 2.8148783756252803e-07, + "loss": -0.0192, + "reward": 1.1450893580913544, + "reward_std": 0.31589946895837784, + "rewards/accuracy_reward": 0.20312500558793545, + "rewards/format_reward": 0.941964328289032, + "step": 2454 + }, + { + "completion_length": 856.7611846923828, + "epoch": 0.733328354865208, + "grad_norm": 2.698915481567383, + "kl": 0.47705078125, + "learning_rate": 2.8111133563235613e-07, + "loss": 0.0261, + "reward": 1.0580357313156128, + "reward_std": 0.23040424659848213, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.9687500298023224, + "step": 2455 + }, + { + "completion_length": 872.0625457763672, + "epoch": 0.7336270629527294, + "grad_norm": 1.037410020828247, + "kl": 0.607421875, + "learning_rate": 2.8073512622669726e-07, + "loss": 0.0004, + "reward": 1.147321492433548, + "reward_std": 0.24454094097018242, + "rewards/accuracy_reward": 0.19196429662406445, + "rewards/format_reward": 0.9553571790456772, + "step": 2456 + }, + { + "completion_length": 762.4710083007812, + "epoch": 0.7339257710402509, + "grad_norm": 0.9677467942237854, + "kl": 0.53857421875, + "learning_rate": 2.803592097548301e-07, + "loss": -0.0068, + "reward": 1.1897322237491608, + "reward_std": 0.2263961099088192, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9665178954601288, + "step": 2457 + }, + { + "completion_length": 843.0067291259766, + "epoch": 0.7342244791277723, + "grad_norm": 2.7794220447540283, + "kl": 0.716796875, + "learning_rate": 2.7998358662571513e-07, + "loss": 0.0435, + "reward": 1.1741071939468384, + "reward_std": 0.20992134511470795, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.9665178954601288, + "step": 2458 + }, + { + "completion_length": 840.982177734375, + "epoch": 0.7345231872152939, + "grad_norm": 0.9544228911399841, + "kl": 0.62158203125, + "learning_rate": 2.796082572479936e-07, + "loss": 0.0323, + "reward": 1.1540179252624512, + "reward_std": 0.2455514259636402, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.9665178954601288, + "step": 2459 + }, + { + "completion_length": 810.0156555175781, + "epoch": 0.7348218953028153, + "grad_norm": 1.4831821918487549, + "kl": 0.4990234375, + "learning_rate": 2.7923322202998685e-07, + "loss": 0.0135, + "reward": 1.191964328289032, + "reward_std": 0.2196226343512535, + "rewards/accuracy_reward": 0.22098215483129025, + "rewards/format_reward": 0.9709821790456772, + "step": 2460 + }, + { + "completion_length": 847.3750305175781, + "epoch": 0.7351206033903368, + "grad_norm": 0.8879237174987793, + "kl": 0.582275390625, + "learning_rate": 2.7885848137969643e-07, + "loss": 0.0082, + "reward": 1.2589286267757416, + "reward_std": 0.25987966544926167, + "rewards/accuracy_reward": 0.2946428693830967, + "rewards/format_reward": 0.964285746216774, + "step": 2461 + }, + { + "completion_length": 882.9553833007812, + "epoch": 0.7354193114778582, + "grad_norm": 2.000310182571411, + "kl": 0.755859375, + "learning_rate": 2.784840357048038e-07, + "loss": 0.0683, + "reward": 1.0781250596046448, + "reward_std": 0.25086651742458344, + "rewards/accuracy_reward": 0.12500000302679837, + "rewards/format_reward": 0.9531250447034836, + "step": 2462 + }, + { + "completion_length": 798.8303985595703, + "epoch": 0.7357180195653797, + "grad_norm": 1.3566539287567139, + "kl": 0.61767578125, + "learning_rate": 2.781098854126687e-07, + "loss": 0.0547, + "reward": 1.2522321939468384, + "reward_std": 0.2164020724594593, + "rewards/accuracy_reward": 0.290178582072258, + "rewards/format_reward": 0.9620536118745804, + "step": 2463 + }, + { + "completion_length": 890.1362152099609, + "epoch": 0.7360167276529012, + "grad_norm": 1.2476284503936768, + "kl": 0.65576171875, + "learning_rate": 2.777360309103301e-07, + "loss": 0.0109, + "reward": 1.026785746216774, + "reward_std": 0.1855344120413065, + "rewards/accuracy_reward": 0.06919643003493547, + "rewards/format_reward": 0.957589328289032, + "step": 2464 + }, + { + "completion_length": 854.888427734375, + "epoch": 0.7363154357404227, + "grad_norm": 1.4644416570663452, + "kl": 0.7021484375, + "learning_rate": 2.773624726045054e-07, + "loss": 0.005, + "reward": 1.0625000596046448, + "reward_std": 0.24347715079784393, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.964285746216774, + "step": 2465 + }, + { + "completion_length": 940.091552734375, + "epoch": 0.7366141438279441, + "grad_norm": 1.7655816078186035, + "kl": 0.8828125, + "learning_rate": 2.76989210901589e-07, + "loss": 0.0482, + "reward": 1.0580357611179352, + "reward_std": 0.31058255583047867, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9330357611179352, + "step": 2466 + }, + { + "completion_length": 912.6495819091797, + "epoch": 0.7369128519154656, + "grad_norm": 1.2923396825790405, + "kl": 0.9501953125, + "learning_rate": 2.7661624620765324e-07, + "loss": 0.0432, + "reward": 1.1562500596046448, + "reward_std": 0.1359657198190689, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9642857611179352, + "step": 2467 + }, + { + "completion_length": 824.5379791259766, + "epoch": 0.737211560002987, + "grad_norm": 2.1316351890563965, + "kl": 0.65625, + "learning_rate": 2.7624357892844705e-07, + "loss": 0.0412, + "reward": 1.0602678805589676, + "reward_std": 0.2147372029721737, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.964285746216774, + "step": 2468 + }, + { + "completion_length": 814.2009429931641, + "epoch": 0.7375102680905086, + "grad_norm": 1.702445149421692, + "kl": 0.724609375, + "learning_rate": 2.7587120946939595e-07, + "loss": 0.0181, + "reward": 1.191964328289032, + "reward_std": 0.2681686691939831, + "rewards/accuracy_reward": 0.21875001350417733, + "rewards/format_reward": 0.973214328289032, + "step": 2469 + }, + { + "completion_length": 799.3504791259766, + "epoch": 0.73780897617803, + "grad_norm": 2.734940767288208, + "kl": 0.6494140625, + "learning_rate": 2.7549913823560163e-07, + "loss": 0.0418, + "reward": 1.1540178954601288, + "reward_std": 0.2155267745256424, + "rewards/accuracy_reward": 0.18750000093132257, + "rewards/format_reward": 0.9665178954601288, + "step": 2470 + }, + { + "completion_length": 828.7701263427734, + "epoch": 0.7381076842655515, + "grad_norm": 2.306779146194458, + "kl": 0.8798828125, + "learning_rate": 2.751273656318408e-07, + "loss": 0.0199, + "reward": 1.0959821939468384, + "reward_std": 0.2492792923003435, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.9598214775323868, + "step": 2471 + }, + { + "completion_length": 824.4620971679688, + "epoch": 0.7384063923530729, + "grad_norm": 1.594081163406372, + "kl": 0.83740234375, + "learning_rate": 2.7475589206256565e-07, + "loss": 0.0666, + "reward": 1.2209821939468384, + "reward_std": 0.2408830039203167, + "rewards/accuracy_reward": 0.2656250149011612, + "rewards/format_reward": 0.9553571939468384, + "step": 2472 + }, + { + "completion_length": 819.919677734375, + "epoch": 0.7387051004405945, + "grad_norm": 1.7920342683792114, + "kl": 1.015625, + "learning_rate": 2.743847179319034e-07, + "loss": 0.0292, + "reward": 1.178571492433548, + "reward_std": 0.23869160562753677, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.9553571939468384, + "step": 2473 + }, + { + "completion_length": 880.2366333007812, + "epoch": 0.7390038085281159, + "grad_norm": 2.0751068592071533, + "kl": 0.78515625, + "learning_rate": 2.7401384364365453e-07, + "loss": 0.0111, + "reward": 1.1383929252624512, + "reward_std": 0.2265188843011856, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.957589328289032, + "step": 2474 + }, + { + "completion_length": 851.2232513427734, + "epoch": 0.7393025166156374, + "grad_norm": 4.985530376434326, + "kl": 0.7861328125, + "learning_rate": 2.7364326960129435e-07, + "loss": 0.029, + "reward": 1.0714285969734192, + "reward_std": 0.20523438975214958, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.9598214626312256, + "step": 2475 + }, + { + "completion_length": 957.9129791259766, + "epoch": 0.7396012247031588, + "grad_norm": 1.336277961730957, + "kl": 0.66259765625, + "learning_rate": 2.7327299620797107e-07, + "loss": 0.0183, + "reward": 0.9866071790456772, + "reward_std": 0.16401131637394428, + "rewards/accuracy_reward": 0.022321428870782256, + "rewards/format_reward": 0.964285746216774, + "step": 2476 + }, + { + "completion_length": 822.3415679931641, + "epoch": 0.7398999327906803, + "grad_norm": 1.0909814834594727, + "kl": 0.50048828125, + "learning_rate": 2.729030238665056e-07, + "loss": 0.0258, + "reward": 1.098214328289032, + "reward_std": 0.26120980829000473, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9709821790456772, + "step": 2477 + }, + { + "completion_length": 846.4531707763672, + "epoch": 0.7401986408782018, + "grad_norm": 2.2571356296539307, + "kl": 0.66357421875, + "learning_rate": 2.7253335297939175e-07, + "loss": 0.0067, + "reward": 1.0468750298023224, + "reward_std": 0.19795310497283936, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.9598214626312256, + "step": 2478 + }, + { + "completion_length": 884.654052734375, + "epoch": 0.7404973489657233, + "grad_norm": 2.1045637130737305, + "kl": 0.763671875, + "learning_rate": 2.7216398394879535e-07, + "loss": 0.0439, + "reward": 1.2165178954601288, + "reward_std": 0.34131404012441635, + "rewards/accuracy_reward": 0.2767857238650322, + "rewards/format_reward": 0.9397321939468384, + "step": 2479 + }, + { + "completion_length": 836.5424499511719, + "epoch": 0.7407960570532447, + "grad_norm": 4.836133003234863, + "kl": 0.7685546875, + "learning_rate": 2.7179491717655345e-07, + "loss": 0.0414, + "reward": 1.0468750298023224, + "reward_std": 0.285282377153635, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.9241071939468384, + "step": 2480 + }, + { + "completion_length": 889.8192443847656, + "epoch": 0.7410947651407662, + "grad_norm": 2.1849329471588135, + "kl": 0.66748046875, + "learning_rate": 2.714261530641747e-07, + "loss": 0.0188, + "reward": 1.2633929252624512, + "reward_std": 0.2571125738322735, + "rewards/accuracy_reward": 0.3013393022119999, + "rewards/format_reward": 0.9620536267757416, + "step": 2481 + }, + { + "completion_length": 772.1919860839844, + "epoch": 0.7413934732282876, + "grad_norm": 2.266568183898926, + "kl": 0.71875, + "learning_rate": 2.7105769201283825e-07, + "loss": 0.04, + "reward": 1.1986607611179352, + "reward_std": 0.26199547201395035, + "rewards/accuracy_reward": 0.2455357238650322, + "rewards/format_reward": 0.9531250298023224, + "step": 2482 + }, + { + "completion_length": 812.1406707763672, + "epoch": 0.7416921813158092, + "grad_norm": 1.7480154037475586, + "kl": 0.61865234375, + "learning_rate": 2.706895344233935e-07, + "loss": 0.0282, + "reward": 1.1495535969734192, + "reward_std": 0.22374620288610458, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.9687500596046448, + "step": 2483 + }, + { + "completion_length": 833.3884429931641, + "epoch": 0.7419908894033306, + "grad_norm": 7.155221462249756, + "kl": 0.8359375, + "learning_rate": 2.7032168069636003e-07, + "loss": 0.0541, + "reward": 1.0647321939468384, + "reward_std": 0.28011785075068474, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.9508928954601288, + "step": 2484 + }, + { + "completion_length": 861.0826263427734, + "epoch": 0.7422895974908521, + "grad_norm": 1.1328190565109253, + "kl": 0.63671875, + "learning_rate": 2.6995413123192647e-07, + "loss": -0.0165, + "reward": 1.0312500596046448, + "reward_std": 0.245094433426857, + "rewards/accuracy_reward": 0.07142857555299997, + "rewards/format_reward": 0.9598214626312256, + "step": 2485 + }, + { + "completion_length": 886.2924346923828, + "epoch": 0.7425883055783735, + "grad_norm": 1.4161053895950317, + "kl": 0.54345703125, + "learning_rate": 2.6958688642995064e-07, + "loss": 0.0305, + "reward": 1.1004464626312256, + "reward_std": 0.21934883296489716, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.964285746216774, + "step": 2486 + }, + { + "completion_length": 901.6629943847656, + "epoch": 0.742887013665895, + "grad_norm": 1.0254405736923218, + "kl": 0.4267578125, + "learning_rate": 2.6921994668995904e-07, + "loss": 0.0051, + "reward": 1.0491072088479996, + "reward_std": 0.15398430079221725, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.9709821790456772, + "step": 2487 + }, + { + "completion_length": 880.7187957763672, + "epoch": 0.7431857217534165, + "grad_norm": 1.686668872833252, + "kl": 0.82275390625, + "learning_rate": 2.6885331241114595e-07, + "loss": 0.0313, + "reward": 1.0223214775323868, + "reward_std": 0.21060540527105331, + "rewards/accuracy_reward": 0.06919643050059676, + "rewards/format_reward": 0.9531250447034836, + "step": 2488 + }, + { + "completion_length": 862.435302734375, + "epoch": 0.743484429840938, + "grad_norm": 1.725418210029602, + "kl": 0.9033203125, + "learning_rate": 2.684869839923737e-07, + "loss": 0.0282, + "reward": 1.1406250894069672, + "reward_std": 0.3020166829228401, + "rewards/accuracy_reward": 0.2053571566939354, + "rewards/format_reward": 0.9352678954601288, + "step": 2489 + }, + { + "completion_length": 852.4219055175781, + "epoch": 0.7437831379284594, + "grad_norm": 1.4073224067687988, + "kl": 0.703125, + "learning_rate": 2.681209618321717e-07, + "loss": 0.0211, + "reward": 1.0580357760190964, + "reward_std": 0.1578836152330041, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.9687500298023224, + "step": 2490 + }, + { + "completion_length": 922.8705596923828, + "epoch": 0.7440818460159809, + "grad_norm": 3.5374057292938232, + "kl": 0.9609375, + "learning_rate": 2.677552463287359e-07, + "loss": 0.0209, + "reward": 1.1026786267757416, + "reward_std": 0.22373071685433388, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.9665178954601288, + "step": 2491 + }, + { + "completion_length": 818.6652374267578, + "epoch": 0.7443805541035023, + "grad_norm": 6.576357841491699, + "kl": 0.9462890625, + "learning_rate": 2.6738983787992917e-07, + "loss": 0.0088, + "reward": 1.1250000447034836, + "reward_std": 0.28273914381861687, + "rewards/accuracy_reward": 0.17410714784637094, + "rewards/format_reward": 0.9508928954601288, + "step": 2492 + }, + { + "completion_length": 832.9531555175781, + "epoch": 0.7446792621910239, + "grad_norm": 8.26693344116211, + "kl": 1.15625, + "learning_rate": 2.670247368832803e-07, + "loss": 0.029, + "reward": 1.1406250596046448, + "reward_std": 0.17738661542534828, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9776786118745804, + "step": 2493 + }, + { + "completion_length": 763.0067443847656, + "epoch": 0.7449779702785453, + "grad_norm": 1.7943028211593628, + "kl": 0.9482421875, + "learning_rate": 2.666599437359829e-07, + "loss": 0.0381, + "reward": 1.1741071939468384, + "reward_std": 0.28715552017092705, + "rewards/accuracy_reward": 0.23214286752045155, + "rewards/format_reward": 0.941964328289032, + "step": 2494 + }, + { + "completion_length": 855.1830596923828, + "epoch": 0.7452766783660668, + "grad_norm": 2.197150707244873, + "kl": 0.5927734375, + "learning_rate": 2.662954588348966e-07, + "loss": 0.0187, + "reward": 1.1808036267757416, + "reward_std": 0.2080826424062252, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9799107611179352, + "step": 2495 + }, + { + "completion_length": 821.3035888671875, + "epoch": 0.7455753864535882, + "grad_norm": 1.3974190950393677, + "kl": 0.81640625, + "learning_rate": 2.659312825765448e-07, + "loss": 0.0454, + "reward": 1.2232142984867096, + "reward_std": 0.26309841871261597, + "rewards/accuracy_reward": 0.2723214402794838, + "rewards/format_reward": 0.9508928954601288, + "step": 2496 + }, + { + "completion_length": 846.6094207763672, + "epoch": 0.7458740945411098, + "grad_norm": 1.023023009300232, + "kl": 0.587890625, + "learning_rate": 2.6556741535711593e-07, + "loss": 0.0213, + "reward": 1.113839328289032, + "reward_std": 0.19952553696930408, + "rewards/accuracy_reward": 0.13839286495931447, + "rewards/format_reward": 0.9754464775323868, + "step": 2497 + }, + { + "completion_length": 936.0245819091797, + "epoch": 0.7461728026286312, + "grad_norm": 1.0679875612258911, + "kl": 0.76904296875, + "learning_rate": 2.6520385757246196e-07, + "loss": 0.0085, + "reward": 1.0714286416769028, + "reward_std": 0.26213325187563896, + "rewards/accuracy_reward": 0.1294642947614193, + "rewards/format_reward": 0.941964328289032, + "step": 2498 + }, + { + "completion_length": 752.6741485595703, + "epoch": 0.7464715107161526, + "grad_norm": 1.1017564535140991, + "kl": 0.8291015625, + "learning_rate": 2.648406096180977e-07, + "loss": -0.0091, + "reward": 1.2142857313156128, + "reward_std": 0.22827821038663387, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.973214328289032, + "step": 2499 + }, + { + "completion_length": 828.904052734375, + "epoch": 0.7467702188036741, + "grad_norm": 1.6008009910583496, + "kl": 0.57177734375, + "learning_rate": 2.644776718892015e-07, + "loss": 0.019, + "reward": 1.1897321939468384, + "reward_std": 0.23716535419225693, + "rewards/accuracy_reward": 0.21651786798611283, + "rewards/format_reward": 0.9732143133878708, + "step": 2500 + }, + { + "completion_length": 878.669677734375, + "epoch": 0.7470689268911955, + "grad_norm": 1.7461822032928467, + "kl": 0.77880859375, + "learning_rate": 2.641150447806143e-07, + "loss": 0.0707, + "reward": 1.1316964626312256, + "reward_std": 0.2691107653081417, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.9620535969734192, + "step": 2501 + }, + { + "completion_length": 765.8594055175781, + "epoch": 0.747367634978717, + "grad_norm": 1.5163191556930542, + "kl": 0.6826171875, + "learning_rate": 2.637527286868385e-07, + "loss": 0.0352, + "reward": 1.176339328289032, + "reward_std": 0.18393199983984232, + "rewards/accuracy_reward": 0.20089287171140313, + "rewards/format_reward": 0.9754464626312256, + "step": 2502 + }, + { + "completion_length": 856.6317443847656, + "epoch": 0.7476663430662385, + "grad_norm": 1.11219322681427, + "kl": 0.56298828125, + "learning_rate": 2.6339072400203866e-07, + "loss": 0.0128, + "reward": 1.2120536267757416, + "reward_std": 0.25166914984583855, + "rewards/accuracy_reward": 0.25000001303851604, + "rewards/format_reward": 0.9620536118745804, + "step": 2503 + }, + { + "completion_length": 888.0089721679688, + "epoch": 0.74796505115376, + "grad_norm": 1.4986931085586548, + "kl": 0.5615234375, + "learning_rate": 2.630290311200405e-07, + "loss": 0.015, + "reward": 1.0357143431901932, + "reward_std": 0.18827393651008606, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.9687500447034836, + "step": 2504 + }, + { + "completion_length": 879.5357360839844, + "epoch": 0.7482637592412814, + "grad_norm": 2.4672577381134033, + "kl": 0.54150390625, + "learning_rate": 2.6266765043433013e-07, + "loss": 0.0062, + "reward": 1.1361607611179352, + "reward_std": 0.2839817199856043, + "rewards/accuracy_reward": 0.17410714668221772, + "rewards/format_reward": 0.9620536267757416, + "step": 2505 + }, + { + "completion_length": 869.7768249511719, + "epoch": 0.7485624673288029, + "grad_norm": 0.7506929039955139, + "kl": 0.578125, + "learning_rate": 2.623065823380545e-07, + "loss": 0.0061, + "reward": 1.098214328289032, + "reward_std": 0.26248597353696823, + "rewards/accuracy_reward": 0.13616071734577417, + "rewards/format_reward": 0.9620535969734192, + "step": 2506 + }, + { + "completion_length": 862.6897888183594, + "epoch": 0.7488611754163244, + "grad_norm": 1.5943236351013184, + "kl": 0.861328125, + "learning_rate": 2.6194582722402046e-07, + "loss": -0.0099, + "reward": 1.0245535969734192, + "reward_std": 0.19456500932574272, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.9598214626312256, + "step": 2507 + }, + { + "completion_length": 728.2879943847656, + "epoch": 0.7491598835038459, + "grad_norm": 1.1932452917099, + "kl": 0.52294921875, + "learning_rate": 2.61585385484694e-07, + "loss": 0.0158, + "reward": 1.1897322237491608, + "reward_std": 0.2685510627925396, + "rewards/accuracy_reward": 0.2209821566939354, + "rewards/format_reward": 0.9687500298023224, + "step": 2508 + }, + { + "completion_length": 782.388427734375, + "epoch": 0.7494585915913673, + "grad_norm": 1.970641851425171, + "kl": 0.61669921875, + "learning_rate": 2.6122525751220047e-07, + "loss": 0.0395, + "reward": 1.129464328289032, + "reward_std": 0.19726612605154514, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9687500447034836, + "step": 2509 + }, + { + "completion_length": 917.7009429931641, + "epoch": 0.7497572996788888, + "grad_norm": 0.8323880434036255, + "kl": 0.44677734375, + "learning_rate": 2.6086544369832373e-07, + "loss": -0.0139, + "reward": 1.0691964626312256, + "reward_std": 0.18487629666924477, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.9776786118745804, + "step": 2510 + }, + { + "completion_length": 810.0848541259766, + "epoch": 0.7500560077664102, + "grad_norm": 1.3470256328582764, + "kl": 0.53857421875, + "learning_rate": 2.6050594443450604e-07, + "loss": -0.0022, + "reward": 1.174107164144516, + "reward_std": 0.21652203053236008, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9732143431901932, + "step": 2511 + }, + { + "completion_length": 778.5870819091797, + "epoch": 0.7503547158539318, + "grad_norm": 1.2274564504623413, + "kl": 0.7041015625, + "learning_rate": 2.6014676011184743e-07, + "loss": 0.0094, + "reward": 1.1897321939468384, + "reward_std": 0.18776551634073257, + "rewards/accuracy_reward": 0.21428572107106447, + "rewards/format_reward": 0.9754464775323868, + "step": 2512 + }, + { + "completion_length": 801.8616638183594, + "epoch": 0.7506534239414532, + "grad_norm": 1.2181806564331055, + "kl": 0.59375, + "learning_rate": 2.5978789112110496e-07, + "loss": 0.0229, + "reward": 1.1852678954601288, + "reward_std": 0.2751913405954838, + "rewards/accuracy_reward": 0.21428572945296764, + "rewards/format_reward": 0.9709821939468384, + "step": 2513 + }, + { + "completion_length": 801.0491485595703, + "epoch": 0.7509521320289747, + "grad_norm": 1.339954137802124, + "kl": 0.96142578125, + "learning_rate": 2.5942933785269316e-07, + "loss": 0.0024, + "reward": 1.0758928656578064, + "reward_std": 0.276410985738039, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9553571790456772, + "step": 2514 + }, + { + "completion_length": 880.0178985595703, + "epoch": 0.7512508401164961, + "grad_norm": 1.5727789402008057, + "kl": 0.50146484375, + "learning_rate": 2.5907110069668293e-07, + "loss": 0.047, + "reward": 1.162946492433548, + "reward_std": 0.25019813887774944, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9754464775323868, + "step": 2515 + }, + { + "completion_length": 857.700927734375, + "epoch": 0.7515495482040176, + "grad_norm": 1.1183171272277832, + "kl": 0.5185546875, + "learning_rate": 2.587131800428009e-07, + "loss": 0.025, + "reward": 1.1718750298023224, + "reward_std": 0.25823430344462395, + "rewards/accuracy_reward": 0.19866072572767735, + "rewards/format_reward": 0.973214328289032, + "step": 2516 + }, + { + "completion_length": 867.2120971679688, + "epoch": 0.7518482562915391, + "grad_norm": 0.7768459320068359, + "kl": 0.61572265625, + "learning_rate": 2.5835557628042983e-07, + "loss": -0.0114, + "reward": 1.160714328289032, + "reward_std": 0.2662998363375664, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9620536118745804, + "step": 2517 + }, + { + "completion_length": 851.4531402587891, + "epoch": 0.7521469643790606, + "grad_norm": 0.5931491255760193, + "kl": 0.31298828125, + "learning_rate": 2.5799828979860764e-07, + "loss": -0.0073, + "reward": 1.1830357611179352, + "reward_std": 0.22563264518976212, + "rewards/accuracy_reward": 0.2209821566939354, + "rewards/format_reward": 0.9620536118745804, + "step": 2518 + }, + { + "completion_length": 927.4665679931641, + "epoch": 0.752445672466582, + "grad_norm": 1.1493350267410278, + "kl": 0.935546875, + "learning_rate": 2.5764132098602676e-07, + "loss": 0.0363, + "reward": 1.0803571939468384, + "reward_std": 0.2491183578968048, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.941964328289032, + "step": 2519 + }, + { + "completion_length": 942.0067291259766, + "epoch": 0.7527443805541035, + "grad_norm": 1.1813265085220337, + "kl": 0.55224609375, + "learning_rate": 2.5728467023103463e-07, + "loss": 0.0107, + "reward": 1.0625000298023224, + "reward_std": 0.17971371859312057, + "rewards/accuracy_reward": 0.10491072130389512, + "rewards/format_reward": 0.957589328289032, + "step": 2520 + }, + { + "completion_length": 842.7902221679688, + "epoch": 0.753043088641625, + "grad_norm": 2.353260040283203, + "kl": 0.537109375, + "learning_rate": 2.5692833792163195e-07, + "loss": 0.0382, + "reward": 1.0915178954601288, + "reward_std": 0.2957727685570717, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.9531250447034836, + "step": 2521 + }, + { + "completion_length": 940.1428985595703, + "epoch": 0.7533417967291465, + "grad_norm": 1.4794888496398926, + "kl": 0.67138671875, + "learning_rate": 2.565723244454734e-07, + "loss": -0.0062, + "reward": 1.1361607909202576, + "reward_std": 0.26902687922120094, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.9575893133878708, + "step": 2522 + }, + { + "completion_length": 820.3817291259766, + "epoch": 0.7536405048166679, + "grad_norm": 1.6673916578292847, + "kl": 0.63916015625, + "learning_rate": 2.5621663018986705e-07, + "loss": 0.0012, + "reward": 1.1718750596046448, + "reward_std": 0.24579596519470215, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.964285746216774, + "step": 2523 + }, + { + "completion_length": 965.7656707763672, + "epoch": 0.7539392129041894, + "grad_norm": 1.3667826652526855, + "kl": 0.4873046875, + "learning_rate": 2.558612555417731e-07, + "loss": -0.0109, + "reward": 1.1183036267757416, + "reward_std": 0.19534629955887794, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.973214328289032, + "step": 2524 + }, + { + "completion_length": 785.9375305175781, + "epoch": 0.7542379209917108, + "grad_norm": 0.523883044719696, + "kl": 0.329833984375, + "learning_rate": 2.5550620088780437e-07, + "loss": 0.0192, + "reward": 1.1986607909202576, + "reward_std": 0.2731405682861805, + "rewards/accuracy_reward": 0.2366071492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 2525 + }, + { + "completion_length": 854.6585083007812, + "epoch": 0.7545366290792324, + "grad_norm": 1.0176414251327515, + "kl": 0.51611328125, + "learning_rate": 2.551514666142257e-07, + "loss": 0.0039, + "reward": 1.1026786267757416, + "reward_std": 0.2527660168707371, + "rewards/accuracy_reward": 0.14285715017467737, + "rewards/format_reward": 0.9598214626312256, + "step": 2526 + }, + { + "completion_length": 727.3036041259766, + "epoch": 0.7548353371667538, + "grad_norm": 1.2290500402450562, + "kl": 0.431640625, + "learning_rate": 2.54797053106953e-07, + "loss": -0.0243, + "reward": 1.3705357909202576, + "reward_std": 0.22206765413284302, + "rewards/accuracy_reward": 0.3906250149011612, + "rewards/format_reward": 0.9799107611179352, + "step": 2527 + }, + { + "completion_length": 783.2232513427734, + "epoch": 0.7551340452542753, + "grad_norm": 1.5168824195861816, + "kl": 0.53759765625, + "learning_rate": 2.5444296075155347e-07, + "loss": 0.036, + "reward": 1.1629464626312256, + "reward_std": 0.21221746131777763, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9687500298023224, + "step": 2528 + }, + { + "completion_length": 896.8058471679688, + "epoch": 0.7554327533417967, + "grad_norm": 1.2099021673202515, + "kl": 0.6318359375, + "learning_rate": 2.540891899332451e-07, + "loss": 0.0, + "reward": 1.1205357313156128, + "reward_std": 0.21388444863259792, + "rewards/accuracy_reward": 0.16071428847499192, + "rewards/format_reward": 0.9598214775323868, + "step": 2529 + }, + { + "completion_length": 913.560302734375, + "epoch": 0.7557314614293182, + "grad_norm": 1.719504952430725, + "kl": 0.748046875, + "learning_rate": 2.5373574103689565e-07, + "loss": 0.0125, + "reward": 1.131696492433548, + "reward_std": 0.2642142288386822, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.9598214626312256, + "step": 2530 + }, + { + "completion_length": 870.9129943847656, + "epoch": 0.7560301695168397, + "grad_norm": 1.31370210647583, + "kl": 0.703125, + "learning_rate": 2.5338261444702287e-07, + "loss": 0.0514, + "reward": 1.0892857611179352, + "reward_std": 0.24419860914349556, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9598214477300644, + "step": 2531 + }, + { + "completion_length": 807.4442291259766, + "epoch": 0.7563288776043612, + "grad_norm": 1.4300121068954468, + "kl": 0.775390625, + "learning_rate": 2.5302981054779403e-07, + "loss": 0.0138, + "reward": 1.0781250298023224, + "reward_std": 0.2924589775502682, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9441964775323868, + "step": 2532 + }, + { + "completion_length": 844.4933319091797, + "epoch": 0.7566275856918826, + "grad_norm": 1.8157633543014526, + "kl": 0.55810546875, + "learning_rate": 2.52677329723025e-07, + "loss": 0.0447, + "reward": 1.2209821939468384, + "reward_std": 0.20019536092877388, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.9843750596046448, + "step": 2533 + }, + { + "completion_length": 869.8214721679688, + "epoch": 0.7569262937794041, + "grad_norm": 1.1565765142440796, + "kl": 0.4755859375, + "learning_rate": 2.523251723561807e-07, + "loss": -0.0058, + "reward": 1.158482164144516, + "reward_std": 0.23942400887608528, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9754464775323868, + "step": 2534 + }, + { + "completion_length": 810.6562957763672, + "epoch": 0.7572250018669255, + "grad_norm": 1.2526073455810547, + "kl": 0.93603515625, + "learning_rate": 2.519733388303734e-07, + "loss": 0.0547, + "reward": 1.1607143431901932, + "reward_std": 0.27251068875193596, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.9464286118745804, + "step": 2535 + }, + { + "completion_length": 861.1562957763672, + "epoch": 0.7575237099544471, + "grad_norm": 0.9785649180412292, + "kl": 0.7958984375, + "learning_rate": 2.516218295283637e-07, + "loss": 0.0123, + "reward": 1.142857164144516, + "reward_std": 0.2743324935436249, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9598214626312256, + "step": 2536 + }, + { + "completion_length": 837.763427734375, + "epoch": 0.7578224180419685, + "grad_norm": 1.539530873298645, + "kl": 1.0498046875, + "learning_rate": 2.512706448325594e-07, + "loss": 0.048, + "reward": 0.9821428954601288, + "reward_std": 0.25203607231378555, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.9308036118745804, + "step": 2537 + }, + { + "completion_length": 781.9486999511719, + "epoch": 0.75812112612949, + "grad_norm": 1.0524053573608398, + "kl": 0.80078125, + "learning_rate": 2.509197851250148e-07, + "loss": 0.0189, + "reward": 1.2165179252624512, + "reward_std": 0.2732055149972439, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.9598214626312256, + "step": 2538 + }, + { + "completion_length": 896.5223693847656, + "epoch": 0.7584198342170114, + "grad_norm": 1.6766787767410278, + "kl": 1.1435546875, + "learning_rate": 2.505692507874309e-07, + "loss": 0.0325, + "reward": 1.1250000298023224, + "reward_std": 0.26890403404831886, + "rewards/accuracy_reward": 0.17410715389996767, + "rewards/format_reward": 0.9508928954601288, + "step": 2539 + }, + { + "completion_length": 911.6406707763672, + "epoch": 0.758718542304533, + "grad_norm": 1.8702709674835205, + "kl": 1.0791015625, + "learning_rate": 2.5021904220115496e-07, + "loss": 0.0179, + "reward": 1.0892857611179352, + "reward_std": 0.32446394115686417, + "rewards/accuracy_reward": 0.15401786309666932, + "rewards/format_reward": 0.9352678954601288, + "step": 2540 + }, + { + "completion_length": 985.3995971679688, + "epoch": 0.7590172503920544, + "grad_norm": 1.7876965999603271, + "kl": 1.0205078125, + "learning_rate": 2.4986915974717927e-07, + "loss": 0.0146, + "reward": 1.069196492433548, + "reward_std": 0.23588618263602257, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.9464286118745804, + "step": 2541 + }, + { + "completion_length": 805.6875457763672, + "epoch": 0.7593159584795758, + "grad_norm": 1.532548427581787, + "kl": 0.9697265625, + "learning_rate": 2.495196038061418e-07, + "loss": 0.0429, + "reward": 1.158482164144516, + "reward_std": 0.2715131305158138, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9486607611179352, + "step": 2542 + }, + { + "completion_length": 915.3236999511719, + "epoch": 0.7596146665670973, + "grad_norm": 0.8239653706550598, + "kl": 0.74658203125, + "learning_rate": 2.491703747583253e-07, + "loss": -0.0107, + "reward": 1.0602679252624512, + "reward_std": 0.23464959114789963, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.9553571939468384, + "step": 2543 + }, + { + "completion_length": 882.1294860839844, + "epoch": 0.7599133746546187, + "grad_norm": 1.5669035911560059, + "kl": 0.9150390625, + "learning_rate": 2.4882147298365636e-07, + "loss": 0.036, + "reward": 1.035714328289032, + "reward_std": 0.23806514218449593, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.9508928954601288, + "step": 2544 + }, + { + "completion_length": 894.8281402587891, + "epoch": 0.7602120827421402, + "grad_norm": 1.2428117990493774, + "kl": 1.099609375, + "learning_rate": 2.484728988617063e-07, + "loss": 0.0108, + "reward": 1.102678656578064, + "reward_std": 0.264876414090395, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.948660746216774, + "step": 2545 + }, + { + "completion_length": 904.5268249511719, + "epoch": 0.7605107908296617, + "grad_norm": 1.9329499006271362, + "kl": 0.640625, + "learning_rate": 2.481246527716895e-07, + "loss": 0.0399, + "reward": 1.162946492433548, + "reward_std": 0.18618295807391405, + "rewards/accuracy_reward": 0.17633929708972573, + "rewards/format_reward": 0.9866071790456772, + "step": 2546 + }, + { + "completion_length": 902.4955749511719, + "epoch": 0.7608094989171832, + "grad_norm": 1.1132560968399048, + "kl": 0.6396484375, + "learning_rate": 2.477767350924633e-07, + "loss": 0.0192, + "reward": 1.1607143580913544, + "reward_std": 0.20927909202873707, + "rewards/accuracy_reward": 0.20312500488944352, + "rewards/format_reward": 0.957589328289032, + "step": 2547 + }, + { + "completion_length": 878.2165679931641, + "epoch": 0.7611082070047046, + "grad_norm": 1.1647648811340332, + "kl": 0.580078125, + "learning_rate": 2.474291462025285e-07, + "loss": -0.014, + "reward": 1.118303656578064, + "reward_std": 0.21874134242534637, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.948660746216774, + "step": 2548 + }, + { + "completion_length": 772.7656555175781, + "epoch": 0.7614069150922261, + "grad_norm": 1.739169716835022, + "kl": 0.68603515625, + "learning_rate": 2.4708188648002736e-07, + "loss": -0.0004, + "reward": 1.2299107611179352, + "reward_std": 0.34440353512763977, + "rewards/accuracy_reward": 0.2857143059372902, + "rewards/format_reward": 0.9441964626312256, + "step": 2549 + }, + { + "completion_length": 837.8616485595703, + "epoch": 0.7617056231797475, + "grad_norm": 0.9283076524734497, + "kl": 0.5185546875, + "learning_rate": 2.467349563027445e-07, + "loss": 0.0099, + "reward": 1.0513393133878708, + "reward_std": 0.17683115229010582, + "rewards/accuracy_reward": 0.08258928940631449, + "rewards/format_reward": 0.9687500447034836, + "step": 2550 + }, + { + "completion_length": 846.9531555175781, + "epoch": 0.7620043312672691, + "grad_norm": 0.7925551533699036, + "kl": 0.76953125, + "learning_rate": 2.463883560481062e-07, + "loss": -0.0037, + "reward": 1.1272321939468384, + "reward_std": 0.21339723095297813, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.9709821790456772, + "step": 2551 + }, + { + "completion_length": 859.9531707763672, + "epoch": 0.7623030393547905, + "grad_norm": 0.9242029190063477, + "kl": 0.59033203125, + "learning_rate": 2.4604208609317923e-07, + "loss": -0.0026, + "reward": 1.1562500596046448, + "reward_std": 0.30753491446375847, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9709821939468384, + "step": 2552 + }, + { + "completion_length": 980.6496124267578, + "epoch": 0.762601747442312, + "grad_norm": 1.3087278604507446, + "kl": 0.6259765625, + "learning_rate": 2.4569614681467156e-07, + "loss": 0.0317, + "reward": 1.0892857611179352, + "reward_std": 0.24566367641091347, + "rewards/accuracy_reward": 0.13616072479635477, + "rewards/format_reward": 0.9531250298023224, + "step": 2553 + }, + { + "completion_length": 882.5558624267578, + "epoch": 0.7629004555298334, + "grad_norm": 1.1638908386230469, + "kl": 0.546875, + "learning_rate": 2.4535053858893126e-07, + "loss": -0.0116, + "reward": 1.0982143580913544, + "reward_std": 0.234484750777483, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.9598214626312256, + "step": 2554 + }, + { + "completion_length": 875.1116485595703, + "epoch": 0.763199163617355, + "grad_norm": 1.971235752105713, + "kl": 1.0205078125, + "learning_rate": 2.45005261791946e-07, + "loss": 0.0143, + "reward": 1.1339285969734192, + "reward_std": 0.2698640748858452, + "rewards/accuracy_reward": 0.20089286426082253, + "rewards/format_reward": 0.9330357611179352, + "step": 2555 + }, + { + "completion_length": 879.1317291259766, + "epoch": 0.7634978717048764, + "grad_norm": 3.3778505325317383, + "kl": 0.63818359375, + "learning_rate": 2.4466031679934314e-07, + "loss": -0.0076, + "reward": 1.145089328289032, + "reward_std": 0.28291890025138855, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9598214775323868, + "step": 2556 + }, + { + "completion_length": 943.9464721679688, + "epoch": 0.7637965797923979, + "grad_norm": 1.0494111776351929, + "kl": 0.595703125, + "learning_rate": 2.443157039863894e-07, + "loss": 0.0073, + "reward": 1.084821492433548, + "reward_std": 0.2511305585503578, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.9553571790456772, + "step": 2557 + }, + { + "completion_length": 872.1027221679688, + "epoch": 0.7640952878799193, + "grad_norm": 1.4402724504470825, + "kl": 0.79736328125, + "learning_rate": 2.4397142372798914e-07, + "loss": -0.0237, + "reward": 1.147321492433548, + "reward_std": 0.2743900679051876, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9531250447034836, + "step": 2558 + }, + { + "completion_length": 903.3638916015625, + "epoch": 0.7643939959674408, + "grad_norm": 0.723318874835968, + "kl": 0.69677734375, + "learning_rate": 2.4362747639868594e-07, + "loss": -0.0016, + "reward": 1.0803571939468384, + "reward_std": 0.26250240951776505, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.9508928954601288, + "step": 2559 + }, + { + "completion_length": 919.3527221679688, + "epoch": 0.7646927040549623, + "grad_norm": 1.2730404138565063, + "kl": 0.7509765625, + "learning_rate": 2.4328386237266075e-07, + "loss": 0.0311, + "reward": 1.2812500596046448, + "reward_std": 0.25748884305357933, + "rewards/accuracy_reward": 0.3147321566939354, + "rewards/format_reward": 0.9665178954601288, + "step": 2560 + }, + { + "completion_length": 907.3638763427734, + "epoch": 0.7649914121424838, + "grad_norm": 2.0215818881988525, + "kl": 0.81884765625, + "learning_rate": 2.429405820237318e-07, + "loss": -0.0003, + "reward": 1.1004464775323868, + "reward_std": 0.18519255612045527, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9709821790456772, + "step": 2561 + }, + { + "completion_length": 921.8460083007812, + "epoch": 0.7652901202300052, + "grad_norm": 0.8854357004165649, + "kl": 0.44775390625, + "learning_rate": 2.4259763572535466e-07, + "loss": 0.0031, + "reward": 1.1808035969734192, + "reward_std": 0.17917072214186192, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9821428954601288, + "step": 2562 + }, + { + "completion_length": 970.5201110839844, + "epoch": 0.7655888283175267, + "grad_norm": 0.8404673337936401, + "kl": 0.4521484375, + "learning_rate": 2.422550238506211e-07, + "loss": 0.0256, + "reward": 1.1339286267757416, + "reward_std": 0.19857870042324066, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9776786267757416, + "step": 2563 + }, + { + "completion_length": 821.8705749511719, + "epoch": 0.7658875364050481, + "grad_norm": 3.2328641414642334, + "kl": 1.349609375, + "learning_rate": 2.4191274677225924e-07, + "loss": 0.0663, + "reward": 1.073660746216774, + "reward_std": 0.3003483787178993, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9397321790456772, + "step": 2564 + }, + { + "completion_length": 849.1518249511719, + "epoch": 0.7661862444925697, + "grad_norm": 1.8886301517486572, + "kl": 1.0263671875, + "learning_rate": 2.415708048626333e-07, + "loss": 0.0333, + "reward": 1.0669643133878708, + "reward_std": 0.2661164067685604, + "rewards/accuracy_reward": 0.12276786030270159, + "rewards/format_reward": 0.9441964626312256, + "step": 2565 + }, + { + "completion_length": 935.5870971679688, + "epoch": 0.7664849525800911, + "grad_norm": 1.4039510488510132, + "kl": 0.771484375, + "learning_rate": 2.4122919849374223e-07, + "loss": 0.0467, + "reward": 1.0937500596046448, + "reward_std": 0.2160579226911068, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9598214626312256, + "step": 2566 + }, + { + "completion_length": 860.5759429931641, + "epoch": 0.7667836606676126, + "grad_norm": 1.722290277481079, + "kl": 0.779296875, + "learning_rate": 2.4088792803722036e-07, + "loss": 0.054, + "reward": 1.1651786267757416, + "reward_std": 0.2506945803761482, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.9598214477300644, + "step": 2567 + }, + { + "completion_length": 892.404052734375, + "epoch": 0.767082368755134, + "grad_norm": 1.186975121498108, + "kl": 0.5830078125, + "learning_rate": 2.4054699386433674e-07, + "loss": 0.0606, + "reward": 1.0602678954601288, + "reward_std": 0.1669196765869856, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.9598214626312256, + "step": 2568 + }, + { + "completion_length": 890.8861999511719, + "epoch": 0.7673810768426556, + "grad_norm": 2.3617427349090576, + "kl": 0.611328125, + "learning_rate": 2.40206396345994e-07, + "loss": 0.0419, + "reward": 1.2008928656578064, + "reward_std": 0.23680797219276428, + "rewards/accuracy_reward": 0.2366071604192257, + "rewards/format_reward": 0.9642857611179352, + "step": 2569 + }, + { + "completion_length": 860.0870971679688, + "epoch": 0.767679784930177, + "grad_norm": 1.7722923755645752, + "kl": 0.59130859375, + "learning_rate": 2.398661358527289e-07, + "loss": 0.0303, + "reward": 1.1026785969734192, + "reward_std": 0.2276659607887268, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9553571790456772, + "step": 2570 + }, + { + "completion_length": 825.6473693847656, + "epoch": 0.7679784930176985, + "grad_norm": 1.360585331916809, + "kl": 0.59765625, + "learning_rate": 2.3952621275471186e-07, + "loss": 0.0069, + "reward": 1.1339285969734192, + "reward_std": 0.23303445801138878, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9665178805589676, + "step": 2571 + }, + { + "completion_length": 945.5736999511719, + "epoch": 0.7682772011052199, + "grad_norm": 1.0261090993881226, + "kl": 0.490966796875, + "learning_rate": 2.391866274217455e-07, + "loss": 0.0176, + "reward": 1.0937500298023224, + "reward_std": 0.17992744594812393, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.9709821939468384, + "step": 2572 + }, + { + "completion_length": 977.3572082519531, + "epoch": 0.7685759091927414, + "grad_norm": 1.5227019786834717, + "kl": 0.9609375, + "learning_rate": 2.3884738022326547e-07, + "loss": 0.0386, + "reward": 1.0200892984867096, + "reward_std": 0.22243746742606163, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.9508928954601288, + "step": 2573 + }, + { + "completion_length": 941.0112152099609, + "epoch": 0.7688746172802629, + "grad_norm": 1.7807477712631226, + "kl": 0.673828125, + "learning_rate": 2.3850847152833965e-07, + "loss": 0.0387, + "reward": 1.0959822088479996, + "reward_std": 0.18795089051127434, + "rewards/accuracy_reward": 0.13616071734577417, + "rewards/format_reward": 0.9598214775323868, + "step": 2574 + }, + { + "completion_length": 927.4553833007812, + "epoch": 0.7691733253677844, + "grad_norm": 1.345863938331604, + "kl": 0.44921875, + "learning_rate": 2.3816990170566723e-07, + "loss": 0.0244, + "reward": 1.1517857611179352, + "reward_std": 0.21797207929193974, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.9620536267757416, + "step": 2575 + }, + { + "completion_length": 880.5290679931641, + "epoch": 0.7694720334553058, + "grad_norm": 1.7758679389953613, + "kl": 1.2265625, + "learning_rate": 2.378316711235793e-07, + "loss": 0.0585, + "reward": 1.058035746216774, + "reward_std": 0.2503250688314438, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.9441964775323868, + "step": 2576 + }, + { + "completion_length": 844.6495819091797, + "epoch": 0.7697707415428273, + "grad_norm": 1.5388907194137573, + "kl": 0.52392578125, + "learning_rate": 2.3749378015003724e-07, + "loss": -0.0193, + "reward": 1.1919643580913544, + "reward_std": 0.21248434111475945, + "rewards/accuracy_reward": 0.21651786309666932, + "rewards/format_reward": 0.9754464477300644, + "step": 2577 + }, + { + "completion_length": 818.0960235595703, + "epoch": 0.7700694496303487, + "grad_norm": 4.985808372497559, + "kl": 0.575439453125, + "learning_rate": 2.3715622915263348e-07, + "loss": 0.0398, + "reward": 1.1718750596046448, + "reward_std": 0.20735778659582138, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9687500447034836, + "step": 2578 + }, + { + "completion_length": 993.0156555175781, + "epoch": 0.7703681577178703, + "grad_norm": 1.412985920906067, + "kl": 0.6826171875, + "learning_rate": 2.3681901849859052e-07, + "loss": 0.0206, + "reward": 1.1093750596046448, + "reward_std": 0.24632856249809265, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9620536118745804, + "step": 2579 + }, + { + "completion_length": 867.5178833007812, + "epoch": 0.7706668658053917, + "grad_norm": 0.9205703735351562, + "kl": 0.507568359375, + "learning_rate": 2.3648214855476028e-07, + "loss": 0.0391, + "reward": 1.0982143133878708, + "reward_std": 0.19620557501912117, + "rewards/accuracy_reward": 0.12053572316654027, + "rewards/format_reward": 0.9776786118745804, + "step": 2580 + }, + { + "completion_length": 844.7210235595703, + "epoch": 0.7709655738929132, + "grad_norm": 1.6317561864852905, + "kl": 0.60302734375, + "learning_rate": 2.361456196876244e-07, + "loss": 0.0233, + "reward": 1.1897321939468384, + "reward_std": 0.22083741426467896, + "rewards/accuracy_reward": 0.2209821566939354, + "rewards/format_reward": 0.9687500298023224, + "step": 2581 + }, + { + "completion_length": 880.2187957763672, + "epoch": 0.7712642819804346, + "grad_norm": 1.92690110206604, + "kl": 0.62890625, + "learning_rate": 2.3580943226329333e-07, + "loss": 0.0067, + "reward": 1.1517857909202576, + "reward_std": 0.19183280877768993, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.973214328289032, + "step": 2582 + }, + { + "completion_length": 850.3527069091797, + "epoch": 0.7715629900679561, + "grad_norm": 2.1038966178894043, + "kl": 0.63671875, + "learning_rate": 2.3547358664750588e-07, + "loss": 0.0367, + "reward": 1.0714285969734192, + "reward_std": 0.21219471655786037, + "rewards/accuracy_reward": 0.10491072060540318, + "rewards/format_reward": 0.9665178954601288, + "step": 2583 + }, + { + "completion_length": 863.0268096923828, + "epoch": 0.7718616981554776, + "grad_norm": 1.1418465375900269, + "kl": 0.79736328125, + "learning_rate": 2.3513808320562925e-07, + "loss": 0.033, + "reward": 1.1651786267757416, + "reward_std": 0.2044534832239151, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.9776786118745804, + "step": 2584 + }, + { + "completion_length": 833.9174346923828, + "epoch": 0.772160406242999, + "grad_norm": 1.285447120666504, + "kl": 0.5693359375, + "learning_rate": 2.3480292230265847e-07, + "loss": 0.0421, + "reward": 1.2031250596046448, + "reward_std": 0.23641052097082138, + "rewards/accuracy_reward": 0.23214286752045155, + "rewards/format_reward": 0.9709821790456772, + "step": 2585 + }, + { + "completion_length": 910.1652221679688, + "epoch": 0.7724591143305205, + "grad_norm": 0.8913881182670593, + "kl": 0.556640625, + "learning_rate": 2.3446810430321544e-07, + "loss": 0.0015, + "reward": 1.1495536267757416, + "reward_std": 0.23420628905296326, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9642857611179352, + "step": 2586 + }, + { + "completion_length": 911.5424346923828, + "epoch": 0.7727578224180419, + "grad_norm": 0.806952714920044, + "kl": 0.5595703125, + "learning_rate": 2.341336295715494e-07, + "loss": -0.0089, + "reward": 1.1049107611179352, + "reward_std": 0.1875635739415884, + "rewards/accuracy_reward": 0.12946428847499192, + "rewards/format_reward": 0.9754464775323868, + "step": 2587 + }, + { + "completion_length": 838.0870971679688, + "epoch": 0.7730565305055634, + "grad_norm": 1.4882357120513916, + "kl": 0.73193359375, + "learning_rate": 2.337994984715364e-07, + "loss": 0.0318, + "reward": 1.1294643580913544, + "reward_std": 0.19000988453626633, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.96651791036129, + "step": 2588 + }, + { + "completion_length": 802.4911041259766, + "epoch": 0.7733552385930849, + "grad_norm": 0.8286867737770081, + "kl": 0.633544921875, + "learning_rate": 2.334657113666779e-07, + "loss": 0.0093, + "reward": 1.1607143133878708, + "reward_std": 0.13512345030903816, + "rewards/accuracy_reward": 0.1919642984867096, + "rewards/format_reward": 0.9687500447034836, + "step": 2589 + }, + { + "completion_length": 828.0982513427734, + "epoch": 0.7736539466806064, + "grad_norm": 1.0127052068710327, + "kl": 0.705078125, + "learning_rate": 2.3313226862010188e-07, + "loss": 0.0339, + "reward": 1.084821492433548, + "reward_std": 0.2426718920469284, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.964285746216774, + "step": 2590 + }, + { + "completion_length": 906.4687957763672, + "epoch": 0.7739526547681278, + "grad_norm": 1.4943336248397827, + "kl": 0.72705078125, + "learning_rate": 2.3279917059456112e-07, + "loss": 0.0475, + "reward": 1.1272321939468384, + "reward_std": 0.19828000664710999, + "rewards/accuracy_reward": 0.16071429196745157, + "rewards/format_reward": 0.9665178954601288, + "step": 2591 + }, + { + "completion_length": 817.8036041259766, + "epoch": 0.7742513628556493, + "grad_norm": 2.1232104301452637, + "kl": 0.5390625, + "learning_rate": 2.3246641765243368e-07, + "loss": -0.006, + "reward": 1.1160714626312256, + "reward_std": 0.26947516202926636, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.95089291036129, + "step": 2592 + }, + { + "completion_length": 816.2768402099609, + "epoch": 0.7745500709431707, + "grad_norm": 1.1876544952392578, + "kl": 0.6943359375, + "learning_rate": 2.321340101557224e-07, + "loss": 0.0123, + "reward": 1.1651786267757416, + "reward_std": 0.24335301667451859, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.964285746216774, + "step": 2593 + }, + { + "completion_length": 940.5491333007812, + "epoch": 0.7748487790306923, + "grad_norm": 1.4764221906661987, + "kl": 0.70947265625, + "learning_rate": 2.3180194846605364e-07, + "loss": 0.0332, + "reward": 1.1383928656578064, + "reward_std": 0.24796271324157715, + "rewards/accuracy_reward": 0.17187500861473382, + "rewards/format_reward": 0.96651791036129, + "step": 2594 + }, + { + "completion_length": 875.7388916015625, + "epoch": 0.7751474871182137, + "grad_norm": 1.1029711961746216, + "kl": 0.60888671875, + "learning_rate": 2.314702329446782e-07, + "loss": 0.0381, + "reward": 1.1071429252624512, + "reward_std": 0.2366618625819683, + "rewards/accuracy_reward": 0.13839286752045155, + "rewards/format_reward": 0.9687500298023224, + "step": 2595 + }, + { + "completion_length": 826.0379791259766, + "epoch": 0.7754461952057352, + "grad_norm": 3.553994655609131, + "kl": 1.2138671875, + "learning_rate": 2.311388639524702e-07, + "loss": 0.0525, + "reward": 1.1227678954601288, + "reward_std": 0.2670012526214123, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9553571939468384, + "step": 2596 + }, + { + "completion_length": 821.2611846923828, + "epoch": 0.7757449032932566, + "grad_norm": 2.1598012447357178, + "kl": 0.64453125, + "learning_rate": 2.3080784184992635e-07, + "loss": 0.0655, + "reward": 1.1093750596046448, + "reward_std": 0.2403372824192047, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.9553571790456772, + "step": 2597 + }, + { + "completion_length": 941.0156860351562, + "epoch": 0.7760436113807782, + "grad_norm": 1.0519843101501465, + "kl": 0.6376953125, + "learning_rate": 2.3047716699716636e-07, + "loss": 0.0565, + "reward": 1.176339328289032, + "reward_std": 0.24825343117117882, + "rewards/accuracy_reward": 0.21651787124574184, + "rewards/format_reward": 0.9598214626312256, + "step": 2598 + }, + { + "completion_length": 932.6652374267578, + "epoch": 0.7763423194682996, + "grad_norm": 0.7757295370101929, + "kl": 0.5107421875, + "learning_rate": 2.3014683975393222e-07, + "loss": 0.0293, + "reward": 1.087053656578064, + "reward_std": 0.18611549213528633, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9776786267757416, + "step": 2599 + }, + { + "completion_length": 953.4062957763672, + "epoch": 0.7766410275558211, + "grad_norm": 1.044270396232605, + "kl": 0.46142578125, + "learning_rate": 2.2981686047958732e-07, + "loss": -0.0138, + "reward": 1.0915179252624512, + "reward_std": 0.22579847648739815, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 2600 + }, + { + "completion_length": 742.7366333007812, + "epoch": 0.7769397356433425, + "grad_norm": 0.8333187103271484, + "kl": 0.48486328125, + "learning_rate": 2.29487229533117e-07, + "loss": 0.0108, + "reward": 1.2566964626312256, + "reward_std": 0.2058864515274763, + "rewards/accuracy_reward": 0.2834821529686451, + "rewards/format_reward": 0.973214328289032, + "step": 2601 + }, + { + "completion_length": 899.9598693847656, + "epoch": 0.777238443730864, + "grad_norm": 0.793989896774292, + "kl": 0.47998046875, + "learning_rate": 2.2915794727312722e-07, + "loss": 0.0403, + "reward": 1.225446492433548, + "reward_std": 0.27875928208231926, + "rewards/accuracy_reward": 0.2700892984867096, + "rewards/format_reward": 0.9553572088479996, + "step": 2602 + }, + { + "completion_length": 888.8214721679688, + "epoch": 0.7775371518183855, + "grad_norm": 1.0955877304077148, + "kl": 0.42919921875, + "learning_rate": 2.2882901405784485e-07, + "loss": -0.0133, + "reward": 1.0357143133878708, + "reward_std": 0.19867666065692902, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.9665178954601288, + "step": 2603 + }, + { + "completion_length": 931.4308471679688, + "epoch": 0.777835859905907, + "grad_norm": 2.5510873794555664, + "kl": 0.740234375, + "learning_rate": 2.2850043024511724e-07, + "loss": 0.0643, + "reward": 1.1562500596046448, + "reward_std": 0.259680800139904, + "rewards/accuracy_reward": 0.18526786286383867, + "rewards/format_reward": 0.9709821939468384, + "step": 2604 + }, + { + "completion_length": 880.1138916015625, + "epoch": 0.7781345679934284, + "grad_norm": 1.1566736698150635, + "kl": 0.64453125, + "learning_rate": 2.28172196192411e-07, + "loss": 0.0428, + "reward": 1.1584821939468384, + "reward_std": 0.18912889808416367, + "rewards/accuracy_reward": 0.19866072130389512, + "rewards/format_reward": 0.9598214626312256, + "step": 2605 + }, + { + "completion_length": 806.2388610839844, + "epoch": 0.7784332760809499, + "grad_norm": 1.6083751916885376, + "kl": 0.80078125, + "learning_rate": 2.278443122568128e-07, + "loss": 0.0617, + "reward": 1.1696429252624512, + "reward_std": 0.17711742967367172, + "rewards/accuracy_reward": 0.19419644214212894, + "rewards/format_reward": 0.9754464626312256, + "step": 2606 + }, + { + "completion_length": 942.4553833007812, + "epoch": 0.7787319841684713, + "grad_norm": 0.6827284097671509, + "kl": 0.526123046875, + "learning_rate": 2.2751677879502838e-07, + "loss": 0.0105, + "reward": 1.1138393431901932, + "reward_std": 0.15936138853430748, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.9709821939468384, + "step": 2607 + }, + { + "completion_length": 933.5022735595703, + "epoch": 0.7790306922559929, + "grad_norm": 0.8456253409385681, + "kl": 0.505859375, + "learning_rate": 2.271895961633817e-07, + "loss": 0.0051, + "reward": 1.176339328289032, + "reward_std": 0.16379439458251, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9776786118745804, + "step": 2608 + }, + { + "completion_length": 929.0803985595703, + "epoch": 0.7793294003435143, + "grad_norm": 1.142263650894165, + "kl": 0.4970703125, + "learning_rate": 2.268627647178156e-07, + "loss": -0.0136, + "reward": 1.1406250596046448, + "reward_std": 0.22005050629377365, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.9642857611179352, + "step": 2609 + }, + { + "completion_length": 776.6138763427734, + "epoch": 0.7796281084310358, + "grad_norm": 1.5253119468688965, + "kl": 0.55615234375, + "learning_rate": 2.265362848138908e-07, + "loss": 0.0585, + "reward": 1.1473214626312256, + "reward_std": 0.23758064582943916, + "rewards/accuracy_reward": 0.18526786752045155, + "rewards/format_reward": 0.9620536118745804, + "step": 2610 + }, + { + "completion_length": 900.4620819091797, + "epoch": 0.7799268165185572, + "grad_norm": 1.345265507698059, + "kl": 0.493408203125, + "learning_rate": 2.262101568067851e-07, + "loss": -0.013, + "reward": 1.1026786118745804, + "reward_std": 0.25882144272327423, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9687500447034836, + "step": 2611 + }, + { + "completion_length": 885.8036193847656, + "epoch": 0.7802255246060787, + "grad_norm": 1.1428951025009155, + "kl": 0.607421875, + "learning_rate": 2.2588438105129398e-07, + "loss": 0.0038, + "reward": 1.1093750894069672, + "reward_std": 0.2657526396214962, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.957589328289032, + "step": 2612 + }, + { + "completion_length": 983.8013763427734, + "epoch": 0.7805242326936002, + "grad_norm": 0.8411867618560791, + "kl": 0.4921875, + "learning_rate": 2.2555895790182967e-07, + "loss": -0.0087, + "reward": 1.0625000298023224, + "reward_std": 0.18677044659852982, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.9687500596046448, + "step": 2613 + }, + { + "completion_length": 878.0937957763672, + "epoch": 0.7808229407811217, + "grad_norm": 1.8155972957611084, + "kl": 0.46826171875, + "learning_rate": 2.2523388771242036e-07, + "loss": 0.0432, + "reward": 1.0959821939468384, + "reward_std": 0.20870696380734444, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.9754464775323868, + "step": 2614 + }, + { + "completion_length": 944.9219360351562, + "epoch": 0.7811216488686431, + "grad_norm": 0.6874825954437256, + "kl": 0.45263671875, + "learning_rate": 2.249091708367109e-07, + "loss": 0.0396, + "reward": 1.2611607909202576, + "reward_std": 0.214049082249403, + "rewards/accuracy_reward": 0.2857142984867096, + "rewards/format_reward": 0.9754464775323868, + "step": 2615 + }, + { + "completion_length": 915.6451568603516, + "epoch": 0.7814203569561646, + "grad_norm": 1.3255534172058105, + "kl": 0.66455078125, + "learning_rate": 2.245848076279611e-07, + "loss": 0.037, + "reward": 1.069196492433548, + "reward_std": 0.2026979811489582, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.95089291036129, + "step": 2616 + }, + { + "completion_length": 909.4420013427734, + "epoch": 0.781719065043686, + "grad_norm": 1.204375982284546, + "kl": 0.5537109375, + "learning_rate": 2.2426079843904643e-07, + "loss": 0.0135, + "reward": 1.1361607611179352, + "reward_std": 0.31020326912403107, + "rewards/accuracy_reward": 0.17857143841683865, + "rewards/format_reward": 0.957589328289032, + "step": 2617 + }, + { + "completion_length": 867.1317291259766, + "epoch": 0.7820177731312076, + "grad_norm": 0.5553996562957764, + "kl": 0.5498046875, + "learning_rate": 2.2393714362245743e-07, + "loss": 0.0209, + "reward": 1.2254464626312256, + "reward_std": 0.20218930020928383, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.9709821790456772, + "step": 2618 + }, + { + "completion_length": 916.6875610351562, + "epoch": 0.782316481218729, + "grad_norm": 8.488426208496094, + "kl": 0.5728759765625, + "learning_rate": 2.2361384353029834e-07, + "loss": 0.0162, + "reward": 1.10714291036129, + "reward_std": 0.20414533838629723, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9754464626312256, + "step": 2619 + }, + { + "completion_length": 855.9420013427734, + "epoch": 0.7826151893062505, + "grad_norm": 1.556418776512146, + "kl": 0.638671875, + "learning_rate": 2.2329089851428824e-07, + "loss": 0.0106, + "reward": 1.0691964626312256, + "reward_std": 0.2296544872224331, + "rewards/accuracy_reward": 0.10044643236324191, + "rewards/format_reward": 0.9687500298023224, + "step": 2620 + }, + { + "completion_length": 868.9620971679688, + "epoch": 0.7829138973937719, + "grad_norm": 2.8399147987365723, + "kl": 0.63818359375, + "learning_rate": 2.2296830892575973e-07, + "loss": 0.052, + "reward": 1.1830357611179352, + "reward_std": 0.2015317790210247, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9598214626312256, + "step": 2621 + }, + { + "completion_length": 867.9308471679688, + "epoch": 0.7832126054812935, + "grad_norm": 1.1924971342086792, + "kl": 0.5810546875, + "learning_rate": 2.2264607511565846e-07, + "loss": 0.0585, + "reward": 1.2633928954601288, + "reward_std": 0.26722216233611107, + "rewards/accuracy_reward": 0.30133930407464504, + "rewards/format_reward": 0.9620535969734192, + "step": 2622 + }, + { + "completion_length": 950.8527221679688, + "epoch": 0.7835113135688149, + "grad_norm": 1.238160490989685, + "kl": 0.54296875, + "learning_rate": 2.2232419743454333e-07, + "loss": 0.0277, + "reward": 1.0245536118745804, + "reward_std": 0.16478463634848595, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.9709821939468384, + "step": 2623 + }, + { + "completion_length": 828.7678985595703, + "epoch": 0.7838100216563364, + "grad_norm": 0.8091662526130676, + "kl": 0.47412109375, + "learning_rate": 2.2200267623258585e-07, + "loss": 0.0184, + "reward": 1.0758928954601288, + "reward_std": 0.15428666584193707, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.9866071939468384, + "step": 2624 + }, + { + "completion_length": 816.0781555175781, + "epoch": 0.7841087297438578, + "grad_norm": 1.399854302406311, + "kl": 0.765625, + "learning_rate": 2.2168151185956929e-07, + "loss": 0.0142, + "reward": 1.10714291036129, + "reward_std": 0.16765020042657852, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9598214775323868, + "step": 2625 + }, + { + "completion_length": 860.3080596923828, + "epoch": 0.7844074378313793, + "grad_norm": 1.1182613372802734, + "kl": 0.43603515625, + "learning_rate": 2.2136070466488913e-07, + "loss": 0.0079, + "reward": 1.1339286267757416, + "reward_std": 0.20403317734599113, + "rewards/accuracy_reward": 0.16071429336443543, + "rewards/format_reward": 0.973214328289032, + "step": 2626 + }, + { + "completion_length": 926.7076263427734, + "epoch": 0.7847061459189008, + "grad_norm": 1.1350435018539429, + "kl": 0.5400390625, + "learning_rate": 2.2104025499755236e-07, + "loss": 0.0154, + "reward": 1.0870536118745804, + "reward_std": 0.25318368151783943, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.9575893133878708, + "step": 2627 + }, + { + "completion_length": 978.9531707763672, + "epoch": 0.7850048540064222, + "grad_norm": 1.091230034828186, + "kl": 0.6396484375, + "learning_rate": 2.207201632061765e-07, + "loss": 0.0244, + "reward": 1.1183035969734192, + "reward_std": 0.18777194619178772, + "rewards/accuracy_reward": 0.14955357741564512, + "rewards/format_reward": 0.9687500298023224, + "step": 2628 + }, + { + "completion_length": 953.6473693847656, + "epoch": 0.7853035620939437, + "grad_norm": 1.1930168867111206, + "kl": 0.6435546875, + "learning_rate": 2.2040042963899028e-07, + "loss": 0.0024, + "reward": 1.0089286118745804, + "reward_std": 0.17875012755393982, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.957589328289032, + "step": 2629 + }, + { + "completion_length": 840.5223693847656, + "epoch": 0.7856022701814651, + "grad_norm": 1.4141563177108765, + "kl": 0.826171875, + "learning_rate": 2.2008105464383227e-07, + "loss": 0.0456, + "reward": 1.1450893580913544, + "reward_std": 0.24676961451768875, + "rewards/accuracy_reward": 0.17857143096625805, + "rewards/format_reward": 0.9665178954601288, + "step": 2630 + }, + { + "completion_length": 866.4129943847656, + "epoch": 0.7859009782689866, + "grad_norm": 0.6949387788772583, + "kl": 0.6865234375, + "learning_rate": 2.1976203856815123e-07, + "loss": 0.0486, + "reward": 1.2321429252624512, + "reward_std": 0.21796510741114616, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.9754464775323868, + "step": 2631 + }, + { + "completion_length": 951.0736999511719, + "epoch": 0.786199686356508, + "grad_norm": 1.5704375505447388, + "kl": 0.904296875, + "learning_rate": 2.1944338175900562e-07, + "loss": 0.0626, + "reward": 1.1227678954601288, + "reward_std": 0.23466235026717186, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.964285746216774, + "step": 2632 + }, + { + "completion_length": 890.0312805175781, + "epoch": 0.7864983944440296, + "grad_norm": 1.7668895721435547, + "kl": 0.7470703125, + "learning_rate": 2.191250845630625e-07, + "loss": 0.0127, + "reward": 1.1562500596046448, + "reward_std": 0.28126027807593346, + "rewards/accuracy_reward": 0.1919642984867096, + "rewards/format_reward": 0.9642857611179352, + "step": 2633 + }, + { + "completion_length": 810.0692291259766, + "epoch": 0.786797102531551, + "grad_norm": 0.9106153249740601, + "kl": 0.58203125, + "learning_rate": 2.1880714732659805e-07, + "loss": 0.0155, + "reward": 1.2031250596046448, + "reward_std": 0.19988244026899338, + "rewards/accuracy_reward": 0.2299107201397419, + "rewards/format_reward": 0.973214328289032, + "step": 2634 + }, + { + "completion_length": 788.2455749511719, + "epoch": 0.7870958106190725, + "grad_norm": 0.9171299338340759, + "kl": 0.744140625, + "learning_rate": 2.1848957039549715e-07, + "loss": 0.0383, + "reward": 1.0468750298023224, + "reward_std": 0.19364333525300026, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.9620536118745804, + "step": 2635 + }, + { + "completion_length": 821.6295013427734, + "epoch": 0.7873945187065939, + "grad_norm": 1.3913921117782593, + "kl": 0.53759765625, + "learning_rate": 2.18172354115252e-07, + "loss": 0.0178, + "reward": 1.1428571939468384, + "reward_std": 0.24271226301789284, + "rewards/accuracy_reward": 0.17187501350417733, + "rewards/format_reward": 0.9709821790456772, + "step": 2636 + }, + { + "completion_length": 882.4263763427734, + "epoch": 0.7876932267941155, + "grad_norm": 0.8996544480323792, + "kl": 0.56591796875, + "learning_rate": 2.1785549883096293e-07, + "loss": 0.0012, + "reward": 1.1383928954601288, + "reward_std": 0.18202587310224771, + "rewards/accuracy_reward": 0.16964286752045155, + "rewards/format_reward": 0.9687500447034836, + "step": 2637 + }, + { + "completion_length": 911.9129943847656, + "epoch": 0.7879919348816369, + "grad_norm": 1.7457506656646729, + "kl": 0.6669921875, + "learning_rate": 2.1753900488733767e-07, + "loss": 0.0395, + "reward": 1.113839328289032, + "reward_std": 0.22720512375235558, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.9687500447034836, + "step": 2638 + }, + { + "completion_length": 858.5223541259766, + "epoch": 0.7882906429691584, + "grad_norm": 1.4310153722763062, + "kl": 0.6630859375, + "learning_rate": 2.1722287262869022e-07, + "loss": 0.0112, + "reward": 1.2142857611179352, + "reward_std": 0.24249618127942085, + "rewards/accuracy_reward": 0.2455357313156128, + "rewards/format_reward": 0.9687500447034836, + "step": 2639 + }, + { + "completion_length": 894.5804138183594, + "epoch": 0.7885893510566798, + "grad_norm": 1.2076480388641357, + "kl": 0.57666015625, + "learning_rate": 2.1690710239894172e-07, + "loss": 0.018, + "reward": 1.1718750298023224, + "reward_std": 0.261619932949543, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9620536118745804, + "step": 2640 + }, + { + "completion_length": 897.5893249511719, + "epoch": 0.7888880591442013, + "grad_norm": 0.8915324211120605, + "kl": 0.44384765625, + "learning_rate": 2.1659169454161935e-07, + "loss": -0.0212, + "reward": 1.0580357611179352, + "reward_std": 0.2365574687719345, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.955357164144516, + "step": 2641 + }, + { + "completion_length": 804.0513763427734, + "epoch": 0.7891867672317228, + "grad_norm": 1.5388606786727905, + "kl": 0.84765625, + "learning_rate": 2.162766493998556e-07, + "loss": 0.0493, + "reward": 1.0959821790456772, + "reward_std": 0.2814883030951023, + "rewards/accuracy_reward": 0.14732143585570157, + "rewards/format_reward": 0.9486607611179352, + "step": 2642 + }, + { + "completion_length": 902.6250305175781, + "epoch": 0.7894854753192443, + "grad_norm": 0.6219817996025085, + "kl": 0.65869140625, + "learning_rate": 2.1596196731638904e-07, + "loss": 0.0285, + "reward": 1.145089328289032, + "reward_std": 0.15174361877143383, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9754464626312256, + "step": 2643 + }, + { + "completion_length": 843.4375457763672, + "epoch": 0.7897841834067657, + "grad_norm": 0.7730607390403748, + "kl": 0.54248046875, + "learning_rate": 2.156476486335627e-07, + "loss": 0.0367, + "reward": 1.082589328289032, + "reward_std": 0.1791204698383808, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.9821428954601288, + "step": 2644 + }, + { + "completion_length": 824.1719207763672, + "epoch": 0.7900828914942872, + "grad_norm": 1.4788109064102173, + "kl": 0.734375, + "learning_rate": 2.1533369369332454e-07, + "loss": 0.0158, + "reward": 1.1116071790456772, + "reward_std": 0.21201767399907112, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.957589328289032, + "step": 2645 + }, + { + "completion_length": 887.8750305175781, + "epoch": 0.7903815995818086, + "grad_norm": 1.393803358078003, + "kl": 0.8564453125, + "learning_rate": 2.1502010283722698e-07, + "loss": 0.0474, + "reward": 1.1495536267757416, + "reward_std": 0.2561633735895157, + "rewards/accuracy_reward": 0.1986607275903225, + "rewards/format_reward": 0.9508928954601288, + "step": 2646 + }, + { + "completion_length": 977.6897735595703, + "epoch": 0.7906803076693302, + "grad_norm": 1.3761155605316162, + "kl": 0.6708984375, + "learning_rate": 2.1470687640642588e-07, + "loss": 0.0198, + "reward": 1.1205357611179352, + "reward_std": 0.18021680787205696, + "rewards/accuracy_reward": 0.16294643515720963, + "rewards/format_reward": 0.957589328289032, + "step": 2647 + }, + { + "completion_length": 789.6272583007812, + "epoch": 0.7909790157568516, + "grad_norm": 1.9008435010910034, + "kl": 0.970703125, + "learning_rate": 2.1439401474168095e-07, + "loss": 0.0485, + "reward": 1.1450892984867096, + "reward_std": 0.23727620020508766, + "rewards/accuracy_reward": 0.1830357275903225, + "rewards/format_reward": 0.9620536118745804, + "step": 2648 + }, + { + "completion_length": 767.9553833007812, + "epoch": 0.7912777238443731, + "grad_norm": 1.0835508108139038, + "kl": 0.7978515625, + "learning_rate": 2.1408151818335518e-07, + "loss": 0.0775, + "reward": 1.2120536267757416, + "reward_std": 0.2850298397243023, + "rewards/accuracy_reward": 0.2589285783469677, + "rewards/format_reward": 0.9531250298023224, + "step": 2649 + }, + { + "completion_length": 895.5156555175781, + "epoch": 0.7915764319318945, + "grad_norm": 1.2171136140823364, + "kl": 0.7822265625, + "learning_rate": 2.13769387071414e-07, + "loss": 0.0357, + "reward": 1.1428571939468384, + "reward_std": 0.27800512686371803, + "rewards/accuracy_reward": 0.18526786286383867, + "rewards/format_reward": 0.957589328289032, + "step": 2650 + }, + { + "completion_length": 845.6228179931641, + "epoch": 0.7918751400194161, + "grad_norm": 1.1946042776107788, + "kl": 0.767578125, + "learning_rate": 2.1345762174542553e-07, + "loss": -0.0033, + "reward": 1.1473214626312256, + "reward_std": 0.1582757756114006, + "rewards/accuracy_reward": 0.16741072107106447, + "rewards/format_reward": 0.9799107611179352, + "step": 2651 + }, + { + "completion_length": 886.0491333007812, + "epoch": 0.7921738481069375, + "grad_norm": 1.6710025072097778, + "kl": 0.8720703125, + "learning_rate": 2.1314622254456e-07, + "loss": 0.0347, + "reward": 1.0513393133878708, + "reward_std": 0.2559089884161949, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.941964328289032, + "step": 2652 + }, + { + "completion_length": 897.0736846923828, + "epoch": 0.792472556194459, + "grad_norm": 1.7456914186477661, + "kl": 0.673828125, + "learning_rate": 2.12835189807589e-07, + "loss": 0.0408, + "reward": 1.0848214626312256, + "reward_std": 0.2332698032259941, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.9642857611179352, + "step": 2653 + }, + { + "completion_length": 838.6719055175781, + "epoch": 0.7927712642819804, + "grad_norm": 1.983034372329712, + "kl": 0.7177734375, + "learning_rate": 2.1252452387288576e-07, + "loss": 0.0515, + "reward": 1.0401786267757416, + "reward_std": 0.13753791712224483, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.9709821939468384, + "step": 2654 + }, + { + "completion_length": 876.4464721679688, + "epoch": 0.7930699723695019, + "grad_norm": 1.2412675619125366, + "kl": 0.79931640625, + "learning_rate": 2.1221422507842458e-07, + "loss": 0.027, + "reward": 1.0937500447034836, + "reward_std": 0.2512327618896961, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9531250447034836, + "step": 2655 + }, + { + "completion_length": 869.6786041259766, + "epoch": 0.7933686804570234, + "grad_norm": 2.170870542526245, + "kl": 0.8427734375, + "learning_rate": 2.119042937617798e-07, + "loss": 0.0445, + "reward": 1.0848214626312256, + "reward_std": 0.23899054527282715, + "rewards/accuracy_reward": 0.12500000721774995, + "rewards/format_reward": 0.9598214775323868, + "step": 2656 + }, + { + "completion_length": 836.9397583007812, + "epoch": 0.7936673885445449, + "grad_norm": 1.436734914779663, + "kl": 1.0244140625, + "learning_rate": 2.1159473026012663e-07, + "loss": 0.0462, + "reward": 1.0959821939468384, + "reward_std": 0.264739952981472, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9464286118745804, + "step": 2657 + }, + { + "completion_length": 832.6897735595703, + "epoch": 0.7939660966320663, + "grad_norm": 1.777143955230713, + "kl": 1.02734375, + "learning_rate": 2.1128553491023948e-07, + "loss": 0.0274, + "reward": 1.1696429252624512, + "reward_std": 0.29805444926023483, + "rewards/accuracy_reward": 0.22098215017467737, + "rewards/format_reward": 0.9486607611179352, + "step": 2658 + }, + { + "completion_length": 903.7187805175781, + "epoch": 0.7942648047195878, + "grad_norm": 1.8385860919952393, + "kl": 1.23828125, + "learning_rate": 2.1097670804849274e-07, + "loss": 0.0494, + "reward": 1.104910746216774, + "reward_std": 0.2587130330502987, + "rewards/accuracy_reward": 0.16294643515720963, + "rewards/format_reward": 0.941964328289032, + "step": 2659 + }, + { + "completion_length": 824.8906555175781, + "epoch": 0.7945635128071092, + "grad_norm": 1.2806156873703003, + "kl": 1.1337890625, + "learning_rate": 2.1066825001086e-07, + "loss": 0.0411, + "reward": 1.118303656578064, + "reward_std": 0.2373046688735485, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.9575893431901932, + "step": 2660 + }, + { + "completion_length": 842.0960235595703, + "epoch": 0.7948622208946308, + "grad_norm": 1.600911021232605, + "kl": 0.796875, + "learning_rate": 2.1036016113291298e-07, + "loss": 0.0534, + "reward": 1.2142857611179352, + "reward_std": 0.3004831522703171, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.9687500447034836, + "step": 2661 + }, + { + "completion_length": 935.0335235595703, + "epoch": 0.7951609289821522, + "grad_norm": 1.4116771221160889, + "kl": 0.7041015625, + "learning_rate": 2.1005244174982236e-07, + "loss": 0.0416, + "reward": 1.1272321939468384, + "reward_std": 0.21337690949440002, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.9754464775323868, + "step": 2662 + }, + { + "completion_length": 821.5803833007812, + "epoch": 0.7954596370696737, + "grad_norm": 2.0325310230255127, + "kl": 0.830078125, + "learning_rate": 2.0974509219635684e-07, + "loss": 0.0221, + "reward": 1.1562500596046448, + "reward_std": 0.27257684245705605, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9575893133878708, + "step": 2663 + }, + { + "completion_length": 867.4464721679688, + "epoch": 0.7957583451571951, + "grad_norm": 1.6634162664413452, + "kl": 1.1220703125, + "learning_rate": 2.0943811280688224e-07, + "loss": 0.0467, + "reward": 1.1741071939468384, + "reward_std": 0.2317611686885357, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9508928954601288, + "step": 2664 + }, + { + "completion_length": 906.3616333007812, + "epoch": 0.7960570532447167, + "grad_norm": 3.0785491466522217, + "kl": 1.22265625, + "learning_rate": 2.0913150391536232e-07, + "loss": 0.0862, + "reward": 1.1718750298023224, + "reward_std": 0.26766424253582954, + "rewards/accuracy_reward": 0.21428572945296764, + "rewards/format_reward": 0.957589328289032, + "step": 2665 + }, + { + "completion_length": 930.9107360839844, + "epoch": 0.7963557613322381, + "grad_norm": 1.928654432296753, + "kl": 0.998046875, + "learning_rate": 2.088252658553576e-07, + "loss": 0.0479, + "reward": 1.0491071939468384, + "reward_std": 0.16846011951565742, + "rewards/accuracy_reward": 0.08035714901052415, + "rewards/format_reward": 0.9687500447034836, + "step": 2666 + }, + { + "completion_length": 874.9911041259766, + "epoch": 0.7966544694197596, + "grad_norm": 1.787838339805603, + "kl": 0.85791015625, + "learning_rate": 2.085193989600247e-07, + "loss": 0.0065, + "reward": 1.129464328289032, + "reward_std": 0.26150383800268173, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 2667 + }, + { + "completion_length": 849.9397583007812, + "epoch": 0.796953177507281, + "grad_norm": 1.123774766921997, + "kl": 0.5908203125, + "learning_rate": 2.0821390356211706e-07, + "loss": 0.031, + "reward": 1.0267857760190964, + "reward_std": 0.11452548764646053, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.9754464775323868, + "step": 2668 + }, + { + "completion_length": 860.8370819091797, + "epoch": 0.7972518855948025, + "grad_norm": 9.004196166992188, + "kl": 0.8564453125, + "learning_rate": 2.0790877999398385e-07, + "loss": 0.0481, + "reward": 1.1339286267757416, + "reward_std": 0.265158798545599, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9531250298023224, + "step": 2669 + }, + { + "completion_length": 864.7344207763672, + "epoch": 0.797550593682324, + "grad_norm": 0.6338464617729187, + "kl": 0.58642578125, + "learning_rate": 2.0760402858756932e-07, + "loss": 0.0303, + "reward": 1.1674107909202576, + "reward_std": 0.27528391033411026, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9687500447034836, + "step": 2670 + }, + { + "completion_length": 870.7991485595703, + "epoch": 0.7978493017698454, + "grad_norm": 0.8846583962440491, + "kl": 0.6142578125, + "learning_rate": 2.0729964967441344e-07, + "loss": 0.0126, + "reward": 1.1406250596046448, + "reward_std": 0.21074498537927866, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.9620535969734192, + "step": 2671 + }, + { + "completion_length": 812.4330596923828, + "epoch": 0.7981480098573669, + "grad_norm": 1.2790995836257935, + "kl": 0.671875, + "learning_rate": 2.0699564358565026e-07, + "loss": 0.0877, + "reward": 1.1272322237491608, + "reward_std": 0.2235286980867386, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9709821939468384, + "step": 2672 + }, + { + "completion_length": 926.0223541259766, + "epoch": 0.7984467179448883, + "grad_norm": 1.0909944772720337, + "kl": 0.5302734375, + "learning_rate": 2.066920106520089e-07, + "loss": 0.0249, + "reward": 1.1138392984867096, + "reward_std": 0.179386168718338, + "rewards/accuracy_reward": 0.1517857222352177, + "rewards/format_reward": 0.9620536118745804, + "step": 2673 + }, + { + "completion_length": 864.0781707763672, + "epoch": 0.7987454260324098, + "grad_norm": 1.4701783657073975, + "kl": 0.43701171875, + "learning_rate": 2.0638875120381228e-07, + "loss": 0.0206, + "reward": 1.1763393580913544, + "reward_std": 0.23909785971045494, + "rewards/accuracy_reward": 0.20312500931322575, + "rewards/format_reward": 0.973214328289032, + "step": 2674 + }, + { + "completion_length": 869.5491485595703, + "epoch": 0.7990441341199312, + "grad_norm": 1.2104449272155762, + "kl": 0.5693359375, + "learning_rate": 2.0608586557097667e-07, + "loss": 0.0144, + "reward": 1.194196492433548, + "reward_std": 0.19124314188957214, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9709821939468384, + "step": 2675 + }, + { + "completion_length": 926.9464874267578, + "epoch": 0.7993428422074528, + "grad_norm": 0.8896836042404175, + "kl": 0.66552734375, + "learning_rate": 2.057833540830121e-07, + "loss": 0.0694, + "reward": 1.0803571939468384, + "reward_std": 0.22549285739660263, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9709821939468384, + "step": 2676 + }, + { + "completion_length": 927.1339721679688, + "epoch": 0.7996415502949742, + "grad_norm": 0.6499295830726624, + "kl": 0.4130859375, + "learning_rate": 2.054812170690216e-07, + "loss": 0.0255, + "reward": 1.1339285969734192, + "reward_std": 0.23286738246679306, + "rewards/accuracy_reward": 0.16294643469154835, + "rewards/format_reward": 0.9709821790456772, + "step": 2677 + }, + { + "completion_length": 850.2455749511719, + "epoch": 0.7999402583824957, + "grad_norm": 1.0226771831512451, + "kl": 0.39501953125, + "learning_rate": 2.0517945485770031e-07, + "loss": 0.0232, + "reward": 1.1294643580913544, + "reward_std": 0.14384849835187197, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.9709821939468384, + "step": 2678 + }, + { + "completion_length": 980.5022888183594, + "epoch": 0.8002389664700171, + "grad_norm": 1.5778982639312744, + "kl": 0.53955078125, + "learning_rate": 2.0487806777733608e-07, + "loss": 0.0488, + "reward": 1.0691964626312256, + "reward_std": 0.20991234481334686, + "rewards/accuracy_reward": 0.10937500791624188, + "rewards/format_reward": 0.9598214626312256, + "step": 2679 + }, + { + "completion_length": 872.3504943847656, + "epoch": 0.8005376745575387, + "grad_norm": 0.9891571402549744, + "kl": 0.413330078125, + "learning_rate": 2.0457705615580862e-07, + "loss": 0.0386, + "reward": 1.082589328289032, + "reward_std": 0.20259209722280502, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.96651791036129, + "step": 2680 + }, + { + "completion_length": 958.3772888183594, + "epoch": 0.8008363826450601, + "grad_norm": 1.1473026275634766, + "kl": 0.568359375, + "learning_rate": 2.042764203205889e-07, + "loss": 0.0227, + "reward": 1.0580357611179352, + "reward_std": 0.1634335145354271, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.9642857611179352, + "step": 2681 + }, + { + "completion_length": 836.7232666015625, + "epoch": 0.8011350907325816, + "grad_norm": 1.4604127407073975, + "kl": 0.4541015625, + "learning_rate": 2.039761605987394e-07, + "loss": 0.013, + "reward": 1.1316964626312256, + "reward_std": 0.1932721845805645, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.970982164144516, + "step": 2682 + }, + { + "completion_length": 901.0937957763672, + "epoch": 0.801433798820103, + "grad_norm": 1.6352264881134033, + "kl": 0.298828125, + "learning_rate": 2.0367627731691312e-07, + "loss": 0.0309, + "reward": 1.131696492433548, + "reward_std": 0.22130280174314976, + "rewards/accuracy_reward": 0.1674107275903225, + "rewards/format_reward": 0.964285746216774, + "step": 2683 + }, + { + "completion_length": 907.2969207763672, + "epoch": 0.8017325069076245, + "grad_norm": 1.483684778213501, + "kl": 0.42529296875, + "learning_rate": 2.0337677080135373e-07, + "loss": -0.0136, + "reward": 1.0602679252624512, + "reward_std": 0.2441001534461975, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.9575893133878708, + "step": 2684 + }, + { + "completion_length": 829.4152069091797, + "epoch": 0.802031214995146, + "grad_norm": 1.0373191833496094, + "kl": 0.36376953125, + "learning_rate": 2.0307764137789508e-07, + "loss": 0.0162, + "reward": 1.274553656578064, + "reward_std": 0.24351216480135918, + "rewards/accuracy_reward": 0.3035714477300644, + "rewards/format_reward": 0.9709822088479996, + "step": 2685 + }, + { + "completion_length": 889.7388763427734, + "epoch": 0.8023299230826675, + "grad_norm": 1.2189397811889648, + "kl": 0.35302734375, + "learning_rate": 2.0277888937196042e-07, + "loss": 0.0059, + "reward": 1.1183036267757416, + "reward_std": 0.26735539361834526, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.9598214775323868, + "step": 2686 + }, + { + "completion_length": 786.9643249511719, + "epoch": 0.8026286311701889, + "grad_norm": 0.6706370711326599, + "kl": 0.5478515625, + "learning_rate": 2.0248051510856285e-07, + "loss": 0.0238, + "reward": 1.1428571939468384, + "reward_std": 0.24577700719237328, + "rewards/accuracy_reward": 0.19419643469154835, + "rewards/format_reward": 0.9486607611179352, + "step": 2687 + }, + { + "completion_length": 855.9888763427734, + "epoch": 0.8029273392577104, + "grad_norm": 1.373241901397705, + "kl": 0.41455078125, + "learning_rate": 2.0218251891230436e-07, + "loss": -0.0042, + "reward": 1.2299107611179352, + "reward_std": 0.2686628997325897, + "rewards/accuracy_reward": 0.2700892984867096, + "rewards/format_reward": 0.9598214626312256, + "step": 2688 + }, + { + "completion_length": 886.8995971679688, + "epoch": 0.8032260473452318, + "grad_norm": 1.120255708694458, + "kl": 0.30615234375, + "learning_rate": 2.018849011073754e-07, + "loss": 0.0113, + "reward": 1.1093750596046448, + "reward_std": 0.1681499220430851, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.9732143431901932, + "step": 2689 + }, + { + "completion_length": 866.4286041259766, + "epoch": 0.8035247554327534, + "grad_norm": 1.2334342002868652, + "kl": 0.33642578125, + "learning_rate": 2.015876620175551e-07, + "loss": 0.0308, + "reward": 1.1562500596046448, + "reward_std": 0.25376659631729126, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9754464626312256, + "step": 2690 + }, + { + "completion_length": 924.7612152099609, + "epoch": 0.8038234635202748, + "grad_norm": 0.908273458480835, + "kl": 0.50732421875, + "learning_rate": 2.0129080196621058e-07, + "loss": 0.0072, + "reward": 1.0825893580913544, + "reward_std": 0.2589210756123066, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.9642857611179352, + "step": 2691 + }, + { + "completion_length": 844.8058319091797, + "epoch": 0.8041221716077963, + "grad_norm": 1.7557907104492188, + "kl": 0.47314453125, + "learning_rate": 2.0099432127629622e-07, + "loss": -0.0017, + "reward": 1.1116071939468384, + "reward_std": 0.19572917744517326, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9598214626312256, + "step": 2692 + }, + { + "completion_length": 787.122802734375, + "epoch": 0.8044208796953177, + "grad_norm": 1.0269927978515625, + "kl": 0.56103515625, + "learning_rate": 2.0069822027035406e-07, + "loss": 0.0397, + "reward": 1.1093750447034836, + "reward_std": 0.26879726722836494, + "rewards/accuracy_reward": 0.15625000605359674, + "rewards/format_reward": 0.9531250447034836, + "step": 2693 + }, + { + "completion_length": 959.6942596435547, + "epoch": 0.8047195877828393, + "grad_norm": 0.922652542591095, + "kl": 0.4306640625, + "learning_rate": 2.004024992705131e-07, + "loss": 0.0231, + "reward": 1.2165179252624512, + "reward_std": 0.24117598682641983, + "rewards/accuracy_reward": 0.2455357238650322, + "rewards/format_reward": 0.9709821790456772, + "step": 2694 + }, + { + "completion_length": 940.9620971679688, + "epoch": 0.8050182958703607, + "grad_norm": 0.7023257613182068, + "kl": 0.5458984375, + "learning_rate": 2.0010715859848865e-07, + "loss": 0.0033, + "reward": 1.1584821939468384, + "reward_std": 0.2697606347501278, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.957589328289032, + "step": 2695 + }, + { + "completion_length": 919.5111999511719, + "epoch": 0.8053170039578822, + "grad_norm": 1.4478533267974854, + "kl": 0.50927734375, + "learning_rate": 1.998121985755825e-07, + "loss": 0.0293, + "reward": 1.0133928954601288, + "reward_std": 0.17921891529113054, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.9598214775323868, + "step": 2696 + }, + { + "completion_length": 850.8862152099609, + "epoch": 0.8056157120454036, + "grad_norm": 1.249863624572754, + "kl": 0.5966796875, + "learning_rate": 1.9951761952268208e-07, + "loss": 0.0122, + "reward": 1.1250000596046448, + "reward_std": 0.2110581174492836, + "rewards/accuracy_reward": 0.16294644074514508, + "rewards/format_reward": 0.9620536118745804, + "step": 2697 + }, + { + "completion_length": 906.888427734375, + "epoch": 0.8059144201329251, + "grad_norm": 1.3903517723083496, + "kl": 0.712890625, + "learning_rate": 1.9922342176026072e-07, + "loss": 0.0249, + "reward": 1.1696429252624512, + "reward_std": 0.2340632677078247, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9687500298023224, + "step": 2698 + }, + { + "completion_length": 844.5335083007812, + "epoch": 0.8062131282204466, + "grad_norm": 1.6757599115371704, + "kl": 0.6376953125, + "learning_rate": 1.9892960560837677e-07, + "loss": 0.0641, + "reward": 1.1406250447034836, + "reward_std": 0.28053721971809864, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9553571790456772, + "step": 2699 + }, + { + "completion_length": 776.138427734375, + "epoch": 0.8065118363079681, + "grad_norm": 1.3387343883514404, + "kl": 0.73046875, + "learning_rate": 1.986361713866732e-07, + "loss": 0.0394, + "reward": 1.1830357313156128, + "reward_std": 0.2525516748428345, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.9575893431901932, + "step": 2700 + }, + { + "completion_length": 789.5201263427734, + "epoch": 0.8068105443954895, + "grad_norm": 1.7999494075775146, + "kl": 0.638671875, + "learning_rate": 1.983431194143778e-07, + "loss": 0.081, + "reward": 1.209821492433548, + "reward_std": 0.29962145537137985, + "rewards/accuracy_reward": 0.2522321529686451, + "rewards/format_reward": 0.9575893431901932, + "step": 2701 + }, + { + "completion_length": 851.5848541259766, + "epoch": 0.807109252483011, + "grad_norm": 1.2193068265914917, + "kl": 0.890625, + "learning_rate": 1.980504500103025e-07, + "loss": 0.07, + "reward": 1.1071428954601288, + "reward_std": 0.16962470300495625, + "rewards/accuracy_reward": 0.14508928917348385, + "rewards/format_reward": 0.9620536118745804, + "step": 2702 + }, + { + "completion_length": 921.2098693847656, + "epoch": 0.8074079605705324, + "grad_norm": 1.3692446947097778, + "kl": 0.68408203125, + "learning_rate": 1.9775816349284276e-07, + "loss": 0.0264, + "reward": 1.0870536118745804, + "reward_std": 0.2186126708984375, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.9598214775323868, + "step": 2703 + }, + { + "completion_length": 918.7790679931641, + "epoch": 0.807706668658054, + "grad_norm": 1.3051985502243042, + "kl": 1.0048828125, + "learning_rate": 1.9746626017997769e-07, + "loss": 0.0462, + "reward": 1.1093750596046448, + "reward_std": 0.3042440116405487, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.941964328289032, + "step": 2704 + }, + { + "completion_length": 977.9888763427734, + "epoch": 0.8080053767455754, + "grad_norm": 1.0075900554656982, + "kl": 0.4833984375, + "learning_rate": 1.971747403892697e-07, + "loss": 0.0304, + "reward": 1.0825893580913544, + "reward_std": 0.2292088270187378, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.9709821790456772, + "step": 2705 + }, + { + "completion_length": 906.6674499511719, + "epoch": 0.8083040848330969, + "grad_norm": 1.7469936609268188, + "kl": 0.884765625, + "learning_rate": 1.9688360443786336e-07, + "loss": 0.0778, + "reward": 1.1026786267757416, + "reward_std": 0.21750776283442974, + "rewards/accuracy_reward": 0.12500000302679837, + "rewards/format_reward": 0.9776786118745804, + "step": 2706 + }, + { + "completion_length": 951.6920013427734, + "epoch": 0.8086027929206183, + "grad_norm": 1.3092228174209595, + "kl": 0.82568359375, + "learning_rate": 1.9659285264248636e-07, + "loss": 0.0508, + "reward": 1.160714328289032, + "reward_std": 0.22125142440199852, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.9732143133878708, + "step": 2707 + }, + { + "completion_length": 810.9286041259766, + "epoch": 0.8089015010081398, + "grad_norm": 2.039433717727661, + "kl": 0.9384765625, + "learning_rate": 1.9630248531944812e-07, + "loss": 0.0423, + "reward": 1.1875000298023224, + "reward_std": 0.2936412878334522, + "rewards/accuracy_reward": 0.2142857275903225, + "rewards/format_reward": 0.973214328289032, + "step": 2708 + }, + { + "completion_length": 860.2812805175781, + "epoch": 0.8092002090956613, + "grad_norm": 1.6822632551193237, + "kl": 0.6962890625, + "learning_rate": 1.9601250278463964e-07, + "loss": 0.0648, + "reward": 1.1361607611179352, + "reward_std": 0.20394105836749077, + "rewards/accuracy_reward": 0.16517858020961285, + "rewards/format_reward": 0.9709821939468384, + "step": 2709 + }, + { + "completion_length": 910.9330749511719, + "epoch": 0.8094989171831828, + "grad_norm": 1.6611422300338745, + "kl": 0.68212890625, + "learning_rate": 1.9572290535353383e-07, + "loss": 0.0008, + "reward": 1.0803571790456772, + "reward_std": 0.1935687456279993, + "rewards/accuracy_reward": 0.11607143376022577, + "rewards/format_reward": 0.9642857611179352, + "step": 2710 + }, + { + "completion_length": 934.8460388183594, + "epoch": 0.8097976252707042, + "grad_norm": 1.3494868278503418, + "kl": 1.052734375, + "learning_rate": 1.9543369334118392e-07, + "loss": 0.0583, + "reward": 1.0357143431901932, + "reward_std": 0.23002035915851593, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.9531250447034836, + "step": 2711 + }, + { + "completion_length": 894.216552734375, + "epoch": 0.8100963333582257, + "grad_norm": 1.0259865522384644, + "kl": 0.810546875, + "learning_rate": 1.9514486706222443e-07, + "loss": 0.03, + "reward": 1.0513393580913544, + "reward_std": 0.2441982999444008, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.948660746216774, + "step": 2712 + }, + { + "completion_length": 935.029052734375, + "epoch": 0.8103950414457471, + "grad_norm": 1.9314355850219727, + "kl": 0.7158203125, + "learning_rate": 1.9485642683087017e-07, + "loss": -0.0066, + "reward": 1.1272321939468384, + "reward_std": 0.2982868403196335, + "rewards/accuracy_reward": 0.1741071566939354, + "rewards/format_reward": 0.9531250447034836, + "step": 2713 + }, + { + "completion_length": 957.2523040771484, + "epoch": 0.8106937495332686, + "grad_norm": 1.7801913022994995, + "kl": 0.880859375, + "learning_rate": 1.9456837296091557e-07, + "loss": 0.0246, + "reward": 1.1316964775323868, + "reward_std": 0.20409822091460228, + "rewards/accuracy_reward": 0.17857143003493547, + "rewards/format_reward": 0.9531250447034836, + "step": 2714 + }, + { + "completion_length": 832.7277221679688, + "epoch": 0.8109924576207901, + "grad_norm": 1.4802353382110596, + "kl": 1.1044921875, + "learning_rate": 1.9428070576573513e-07, + "loss": 0.0653, + "reward": 1.178571492433548, + "reward_std": 0.2527272179722786, + "rewards/accuracy_reward": 0.227678582072258, + "rewards/format_reward": 0.95089291036129, + "step": 2715 + }, + { + "completion_length": 808.9732513427734, + "epoch": 0.8112911657083115, + "grad_norm": 1.2906980514526367, + "kl": 0.619140625, + "learning_rate": 1.9399342555828277e-07, + "loss": 0.0252, + "reward": 1.1830357611179352, + "reward_std": 0.08458778262138367, + "rewards/accuracy_reward": 0.207589291036129, + "rewards/format_reward": 0.9754464626312256, + "step": 2716 + }, + { + "completion_length": 952.0915679931641, + "epoch": 0.811589873795833, + "grad_norm": 1.043923020362854, + "kl": 0.64208984375, + "learning_rate": 1.9370653265109083e-07, + "loss": 0.0224, + "reward": 1.0558036267757416, + "reward_std": 0.13583817146718502, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.9709821790456772, + "step": 2717 + }, + { + "completion_length": 894.0380096435547, + "epoch": 0.8118885818833544, + "grad_norm": 0.6644864678382874, + "kl": 0.51904296875, + "learning_rate": 1.9342002735627083e-07, + "loss": 0.0188, + "reward": 1.1517857611179352, + "reward_std": 0.18502293713390827, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.9754464775323868, + "step": 2718 + }, + { + "completion_length": 891.1540679931641, + "epoch": 0.812187289970876, + "grad_norm": 3.1624715328216553, + "kl": 0.7548828125, + "learning_rate": 1.9313390998551264e-07, + "loss": 0.0447, + "reward": 1.1651785969734192, + "reward_std": 0.2647279053926468, + "rewards/accuracy_reward": 0.20758929336443543, + "rewards/format_reward": 0.957589328289032, + "step": 2719 + }, + { + "completion_length": 889.4955749511719, + "epoch": 0.8124859980583974, + "grad_norm": 1.2175384759902954, + "kl": 0.445068359375, + "learning_rate": 1.9284818085008361e-07, + "loss": 0.0179, + "reward": 1.2165179252624512, + "reward_std": 0.1959007941186428, + "rewards/accuracy_reward": 0.2433035783469677, + "rewards/format_reward": 0.9732143133878708, + "step": 2720 + }, + { + "completion_length": 798.8571929931641, + "epoch": 0.8127847061459189, + "grad_norm": 1.1765435934066772, + "kl": 0.5732421875, + "learning_rate": 1.925628402608292e-07, + "loss": 0.0222, + "reward": 1.0736607313156128, + "reward_std": 0.19247103855013847, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9665178954601288, + "step": 2721 + }, + { + "completion_length": 942.1049499511719, + "epoch": 0.8130834142334403, + "grad_norm": 2.355863332748413, + "kl": 0.6455078125, + "learning_rate": 1.9227788852817212e-07, + "loss": 0.0088, + "reward": 1.1741071939468384, + "reward_std": 0.2985411826521158, + "rewards/accuracy_reward": 0.22098215040750802, + "rewards/format_reward": 0.9531250298023224, + "step": 2722 + }, + { + "completion_length": 874.2522583007812, + "epoch": 0.8133821223209619, + "grad_norm": 1.7238844633102417, + "kl": 0.607421875, + "learning_rate": 1.919933259621116e-07, + "loss": 0.0448, + "reward": 1.1964286267757416, + "reward_std": 0.228895902633667, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.9553571939468384, + "step": 2723 + }, + { + "completion_length": 799.7545013427734, + "epoch": 0.8136808304084833, + "grad_norm": 1.3173574209213257, + "kl": 0.7490234375, + "learning_rate": 1.9170915287222417e-07, + "loss": 0.0393, + "reward": 1.1183036267757416, + "reward_std": 0.225064218044281, + "rewards/accuracy_reward": 0.1629464402794838, + "rewards/format_reward": 0.9553571790456772, + "step": 2724 + }, + { + "completion_length": 918.513427734375, + "epoch": 0.8139795384960048, + "grad_norm": 0.8300442099571228, + "kl": 0.7255859375, + "learning_rate": 1.9142536956766197e-07, + "loss": 0.0134, + "reward": 1.0357143580913544, + "reward_std": 0.21101750433444977, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.957589328289032, + "step": 2725 + }, + { + "completion_length": 1006.2679138183594, + "epoch": 0.8142782465835262, + "grad_norm": 0.5699637532234192, + "kl": 0.46435546875, + "learning_rate": 1.911419763571536e-07, + "loss": -0.0277, + "reward": 1.1696428954601288, + "reward_std": 0.21765761822462082, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9709821939468384, + "step": 2726 + }, + { + "completion_length": 994.5580902099609, + "epoch": 0.8145769546710477, + "grad_norm": 0.6807231903076172, + "kl": 0.50927734375, + "learning_rate": 1.9085897354900328e-07, + "loss": 0.0001, + "reward": 1.116071492433548, + "reward_std": 0.14935309812426567, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.970982164144516, + "step": 2727 + }, + { + "completion_length": 883.7924499511719, + "epoch": 0.8148756627585692, + "grad_norm": 1.1490026712417603, + "kl": 0.48681640625, + "learning_rate": 1.9057636145108997e-07, + "loss": 0.0214, + "reward": 1.1517857611179352, + "reward_std": 0.18720350414514542, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.979910746216774, + "step": 2728 + }, + { + "completion_length": 844.3192291259766, + "epoch": 0.8151743708460907, + "grad_norm": 0.5167511105537415, + "kl": 0.541015625, + "learning_rate": 1.902941403708681e-07, + "loss": 0.0258, + "reward": 1.133928656578064, + "reward_std": 0.23958167433738708, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9776786267757416, + "step": 2729 + }, + { + "completion_length": 844.5245971679688, + "epoch": 0.8154730789336121, + "grad_norm": 1.2354172468185425, + "kl": 0.5927734375, + "learning_rate": 1.9001231061536666e-07, + "loss": -0.0028, + "reward": 1.1227678954601288, + "reward_std": 0.21414319053292274, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9687500298023224, + "step": 2730 + }, + { + "completion_length": 978.7032012939453, + "epoch": 0.8157717870211336, + "grad_norm": 1.5376983880996704, + "kl": 0.5947265625, + "learning_rate": 1.897308724911885e-07, + "loss": 0.0224, + "reward": 1.0379464626312256, + "reward_std": 0.18698212876915932, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.9598214626312256, + "step": 2731 + }, + { + "completion_length": 898.2969207763672, + "epoch": 0.816070495108655, + "grad_norm": 1.5049607753753662, + "kl": 0.648681640625, + "learning_rate": 1.89449826304511e-07, + "loss": -0.0035, + "reward": 1.0758929252624512, + "reward_std": 0.20518555119633675, + "rewards/accuracy_reward": 0.11607143376022577, + "rewards/format_reward": 0.9598214775323868, + "step": 2732 + }, + { + "completion_length": 853.7053985595703, + "epoch": 0.8163692031961766, + "grad_norm": 1.556255578994751, + "kl": 0.66943359375, + "learning_rate": 1.891691723610848e-07, + "loss": -0.0237, + "reward": 1.0491071939468384, + "reward_std": 0.19296247977763414, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9598214775323868, + "step": 2733 + }, + { + "completion_length": 783.5803985595703, + "epoch": 0.816667911283698, + "grad_norm": 0.7447684407234192, + "kl": 0.650390625, + "learning_rate": 1.8888891096623376e-07, + "loss": -0.0063, + "reward": 1.1674107611179352, + "reward_std": 0.18214398622512817, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9776786118745804, + "step": 2734 + }, + { + "completion_length": 926.6317443847656, + "epoch": 0.8169666193712195, + "grad_norm": 1.6589417457580566, + "kl": 0.7236328125, + "learning_rate": 1.8860904242485493e-07, + "loss": 0.0317, + "reward": 1.2008928954601288, + "reward_std": 0.29909849911928177, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.9598214775323868, + "step": 2735 + }, + { + "completion_length": 857.2232513427734, + "epoch": 0.8172653274587409, + "grad_norm": 0.8941437005996704, + "kl": 0.6591796875, + "learning_rate": 1.8832956704141794e-07, + "loss": 0.0098, + "reward": 1.1205357611179352, + "reward_std": 0.1872681425884366, + "rewards/accuracy_reward": 0.15401786658912897, + "rewards/format_reward": 0.9665178954601288, + "step": 2736 + }, + { + "completion_length": 853.9308624267578, + "epoch": 0.8175640355462624, + "grad_norm": 1.735530138015747, + "kl": 0.5283203125, + "learning_rate": 1.880504851199644e-07, + "loss": 0.0216, + "reward": 1.082589328289032, + "reward_std": 0.2234981507062912, + "rewards/accuracy_reward": 0.11607143562287092, + "rewards/format_reward": 0.9665178954601288, + "step": 2737 + }, + { + "completion_length": 899.7165679931641, + "epoch": 0.8178627436337839, + "grad_norm": 1.0998421907424927, + "kl": 0.45947265625, + "learning_rate": 1.8777179696410822e-07, + "loss": 0.0044, + "reward": 1.147321492433548, + "reward_std": 0.25602756440639496, + "rewards/accuracy_reward": 0.1808035832364112, + "rewards/format_reward": 0.9665178954601288, + "step": 2738 + }, + { + "completion_length": 956.4062805175781, + "epoch": 0.8181614517213054, + "grad_norm": 0.9684839844703674, + "kl": 0.40234375, + "learning_rate": 1.874935028770347e-07, + "loss": 0.0024, + "reward": 1.1629464626312256, + "reward_std": 0.19291111081838608, + "rewards/accuracy_reward": 0.18303572107106447, + "rewards/format_reward": 0.9799107313156128, + "step": 2739 + }, + { + "completion_length": 1005.1875457763672, + "epoch": 0.8184601598088268, + "grad_norm": 1.134126901626587, + "kl": 0.673828125, + "learning_rate": 1.872156031615006e-07, + "loss": 0.0112, + "reward": 1.1339286267757416, + "reward_std": 0.2801639512181282, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9598214775323868, + "step": 2740 + }, + { + "completion_length": 853.0982513427734, + "epoch": 0.8187588678963483, + "grad_norm": 0.6364851593971252, + "kl": 0.428466796875, + "learning_rate": 1.8693809811983366e-07, + "loss": 0.0149, + "reward": 1.1473214626312256, + "reward_std": 0.2314671091735363, + "rewards/accuracy_reward": 0.18303572200238705, + "rewards/format_reward": 0.9642857611179352, + "step": 2741 + }, + { + "completion_length": 885.4643249511719, + "epoch": 0.8190575759838697, + "grad_norm": 1.125819206237793, + "kl": 0.310546875, + "learning_rate": 1.8666098805393198e-07, + "loss": -0.0095, + "reward": 1.1160714626312256, + "reward_std": 0.16242557391524315, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9843750447034836, + "step": 2742 + }, + { + "completion_length": 985.3795166015625, + "epoch": 0.8193562840713913, + "grad_norm": 0.819788932800293, + "kl": 0.560546875, + "learning_rate": 1.8638427326526424e-07, + "loss": 0.0004, + "reward": 1.0892857611179352, + "reward_std": 0.23904490657150745, + "rewards/accuracy_reward": 0.12946429196745157, + "rewards/format_reward": 0.9598214775323868, + "step": 2743 + }, + { + "completion_length": 1033.2455749511719, + "epoch": 0.8196549921589127, + "grad_norm": 0.9552382230758667, + "kl": 0.552734375, + "learning_rate": 1.8610795405486913e-07, + "loss": -0.0015, + "reward": 1.0714286416769028, + "reward_std": 0.21128393337130547, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.95089291036129, + "step": 2744 + }, + { + "completion_length": 928.3393249511719, + "epoch": 0.8199537002464342, + "grad_norm": 0.9890291690826416, + "kl": 0.44921875, + "learning_rate": 1.8583203072335462e-07, + "loss": 0.0237, + "reward": 1.1428571939468384, + "reward_std": 0.23641925677657127, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9620536267757416, + "step": 2745 + }, + { + "completion_length": 911.8683471679688, + "epoch": 0.8202524083339556, + "grad_norm": 0.8015111684799194, + "kl": 0.4013671875, + "learning_rate": 1.855565035708984e-07, + "loss": 0.0402, + "reward": 1.1763393580913544, + "reward_std": 0.18942627683281898, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9776786118745804, + "step": 2746 + }, + { + "completion_length": 921.5536041259766, + "epoch": 0.8205511164214772, + "grad_norm": 0.9669091701507568, + "kl": 0.654296875, + "learning_rate": 1.852813728972471e-07, + "loss": 0.0094, + "reward": 1.0647321939468384, + "reward_std": 0.24870966374874115, + "rewards/accuracy_reward": 0.10714286100119352, + "rewards/format_reward": 0.957589328289032, + "step": 2747 + }, + { + "completion_length": 880.5893096923828, + "epoch": 0.8208498245089986, + "grad_norm": 2.2769014835357666, + "kl": 0.51611328125, + "learning_rate": 1.8500663900171574e-07, + "loss": 0.0181, + "reward": 1.0959821939468384, + "reward_std": 0.25893889740109444, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.9598214626312256, + "step": 2748 + }, + { + "completion_length": 913.9152221679688, + "epoch": 0.8211485325965201, + "grad_norm": 0.9469032287597656, + "kl": 0.40380859375, + "learning_rate": 1.8473230218318792e-07, + "loss": 0.0479, + "reward": 1.0758929252624512, + "reward_std": 0.23057621717453003, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.9687500596046448, + "step": 2749 + }, + { + "completion_length": 889.5111846923828, + "epoch": 0.8214472406840415, + "grad_norm": 1.7501558065414429, + "kl": 0.388671875, + "learning_rate": 1.8445836274011538e-07, + "loss": 0.0292, + "reward": 1.0959821939468384, + "reward_std": 0.20672770589590073, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.964285746216774, + "step": 2750 + }, + { + "completion_length": 877.8460235595703, + "epoch": 0.821745948771563, + "grad_norm": 1.284035563468933, + "kl": 0.7119140625, + "learning_rate": 1.8418482097051713e-07, + "loss": 0.0143, + "reward": 1.2165179252624512, + "reward_std": 0.2643352672457695, + "rewards/accuracy_reward": 0.2522321566939354, + "rewards/format_reward": 0.964285746216774, + "step": 2751 + }, + { + "completion_length": 856.3482513427734, + "epoch": 0.8220446568590845, + "grad_norm": 3.4165444374084473, + "kl": 0.62451171875, + "learning_rate": 1.8391167717198004e-07, + "loss": 0.0349, + "reward": 1.0200893133878708, + "reward_std": 0.2257486879825592, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.941964328289032, + "step": 2752 + }, + { + "completion_length": 953.0089569091797, + "epoch": 0.822343364946606, + "grad_norm": 1.3463319540023804, + "kl": 0.6845703125, + "learning_rate": 1.8363893164165756e-07, + "loss": 0.0365, + "reward": 1.0580357611179352, + "reward_std": 0.1876462223008275, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.9620536118745804, + "step": 2753 + }, + { + "completion_length": 853.1897735595703, + "epoch": 0.8226420730341274, + "grad_norm": 1.6158971786499023, + "kl": 0.5576171875, + "learning_rate": 1.833665846762702e-07, + "loss": 0.0294, + "reward": 1.0758928954601288, + "reward_std": 0.21099094115197659, + "rewards/accuracy_reward": 0.10714286286383867, + "rewards/format_reward": 0.9687500596046448, + "step": 2754 + }, + { + "completion_length": 887.3036041259766, + "epoch": 0.8229407811216489, + "grad_norm": 1.5283478498458862, + "kl": 0.54248046875, + "learning_rate": 1.830946365721049e-07, + "loss": 0.0283, + "reward": 1.1651785969734192, + "reward_std": 0.19145714864134789, + "rewards/accuracy_reward": 0.20089287497103214, + "rewards/format_reward": 0.9642857611179352, + "step": 2755 + }, + { + "completion_length": 863.1964721679688, + "epoch": 0.8232394892091703, + "grad_norm": 1.2515146732330322, + "kl": 0.4638671875, + "learning_rate": 1.8282308762501425e-07, + "loss": 0.0242, + "reward": 1.1919643580913544, + "reward_std": 0.2953328415751457, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.9665178954601288, + "step": 2756 + }, + { + "completion_length": 859.3995819091797, + "epoch": 0.8235381972966918, + "grad_norm": 2.6075055599212646, + "kl": 0.65966796875, + "learning_rate": 1.8255193813041707e-07, + "loss": 0.0366, + "reward": 1.1562500596046448, + "reward_std": 0.22188277915120125, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9598214775323868, + "step": 2757 + }, + { + "completion_length": 809.9152221679688, + "epoch": 0.8238369053842133, + "grad_norm": 1.7297018766403198, + "kl": 0.7998046875, + "learning_rate": 1.8228118838329759e-07, + "loss": 0.0266, + "reward": 1.0870535969734192, + "reward_std": 0.22613956406712532, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.9687500447034836, + "step": 2758 + }, + { + "completion_length": 982.9286041259766, + "epoch": 0.8241356134717347, + "grad_norm": 1.5264745950698853, + "kl": 0.64111328125, + "learning_rate": 1.8201083867820472e-07, + "loss": 0.0379, + "reward": 1.1428571939468384, + "reward_std": 0.2303539365530014, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.957589328289032, + "step": 2759 + }, + { + "completion_length": 877.5692291259766, + "epoch": 0.8244343215592562, + "grad_norm": 1.5021882057189941, + "kl": 0.48193359375, + "learning_rate": 1.8174088930925253e-07, + "loss": 0.0236, + "reward": 1.223214328289032, + "reward_std": 0.2505479995161295, + "rewards/accuracy_reward": 0.2566964377183467, + "rewards/format_reward": 0.9665178954601288, + "step": 2760 + }, + { + "completion_length": 976.3861999511719, + "epoch": 0.8247330296467776, + "grad_norm": 1.1986840963363647, + "kl": 0.56103515625, + "learning_rate": 1.8147134057011963e-07, + "loss": -0.005, + "reward": 1.0892857611179352, + "reward_std": 0.23879427090287209, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.9598214626312256, + "step": 2761 + }, + { + "completion_length": 900.7544860839844, + "epoch": 0.8250317377342992, + "grad_norm": 0.7732748985290527, + "kl": 0.744140625, + "learning_rate": 1.8120219275404836e-07, + "loss": 0.0276, + "reward": 1.069196492433548, + "reward_std": 0.20396763458848, + "rewards/accuracy_reward": 0.11160715040750802, + "rewards/format_reward": 0.9575893133878708, + "step": 2762 + }, + { + "completion_length": 897.6183471679688, + "epoch": 0.8253304458218206, + "grad_norm": 1.129333734512329, + "kl": 0.55419921875, + "learning_rate": 1.809334461538454e-07, + "loss": 0.0625, + "reward": 1.053571492433548, + "reward_std": 0.23162442818284035, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.9620536118745804, + "step": 2763 + }, + { + "completion_length": 921.5112152099609, + "epoch": 0.8256291539093421, + "grad_norm": 1.4001095294952393, + "kl": 0.662109375, + "learning_rate": 1.8066510106188055e-07, + "loss": 0.0227, + "reward": 1.1696429252624512, + "reward_std": 0.22562123835086823, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.9598214775323868, + "step": 2764 + }, + { + "completion_length": 882.1406707763672, + "epoch": 0.8259278619968635, + "grad_norm": 1.0342103242874146, + "kl": 0.52587890625, + "learning_rate": 1.80397157770087e-07, + "loss": 0.0307, + "reward": 1.0937500447034836, + "reward_std": 0.1570274941623211, + "rewards/accuracy_reward": 0.12500000861473382, + "rewards/format_reward": 0.9687500298023224, + "step": 2765 + }, + { + "completion_length": 790.8437805175781, + "epoch": 0.826226570084385, + "grad_norm": 0.7907979488372803, + "kl": 0.6123046875, + "learning_rate": 1.8012961656996095e-07, + "loss": 0.0474, + "reward": 1.1339285969734192, + "reward_std": 0.1771609839051962, + "rewards/accuracy_reward": 0.15848215483129025, + "rewards/format_reward": 0.9754464626312256, + "step": 2766 + }, + { + "completion_length": 969.872802734375, + "epoch": 0.8265252781719065, + "grad_norm": 1.2315876483917236, + "kl": 0.8798828125, + "learning_rate": 1.7986247775256078e-07, + "loss": 0.0704, + "reward": 1.069196492433548, + "reward_std": 0.27221912890672684, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.957589328289032, + "step": 2767 + }, + { + "completion_length": 1042.3437805175781, + "epoch": 0.826823986259428, + "grad_norm": 2.120345115661621, + "kl": 0.806640625, + "learning_rate": 1.7959574160850744e-07, + "loss": 0.0174, + "reward": 1.0647321939468384, + "reward_std": 0.2536979913711548, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9553571939468384, + "step": 2768 + }, + { + "completion_length": 872.4420013427734, + "epoch": 0.8271226943469494, + "grad_norm": 0.966094970703125, + "kl": 0.61767578125, + "learning_rate": 1.793294084279838e-07, + "loss": 0.0051, + "reward": 1.1674107611179352, + "reward_std": 0.16439638659358025, + "rewards/accuracy_reward": 0.1964285783469677, + "rewards/format_reward": 0.9709821790456772, + "step": 2769 + }, + { + "completion_length": 854.6361999511719, + "epoch": 0.8274214024344709, + "grad_norm": 0.7006591558456421, + "kl": 0.69921875, + "learning_rate": 1.7906347850073404e-07, + "loss": 0.0052, + "reward": 1.1495535969734192, + "reward_std": 0.2115786224603653, + "rewards/accuracy_reward": 0.17633929522708058, + "rewards/format_reward": 0.973214328289032, + "step": 2770 + }, + { + "completion_length": 869.9955749511719, + "epoch": 0.8277201105219923, + "grad_norm": 1.766331434249878, + "kl": 0.8603515625, + "learning_rate": 1.7879795211606402e-07, + "loss": -0.0058, + "reward": 1.131696492433548, + "reward_std": 0.2557273209095001, + "rewards/accuracy_reward": 0.17633929289877415, + "rewards/format_reward": 0.9553571790456772, + "step": 2771 + }, + { + "completion_length": 872.7879791259766, + "epoch": 0.8280188186095139, + "grad_norm": 2.3030519485473633, + "kl": 0.7001953125, + "learning_rate": 1.785328295628405e-07, + "loss": 0.0808, + "reward": 1.1428571790456772, + "reward_std": 0.2369372881948948, + "rewards/accuracy_reward": 0.18526786309666932, + "rewards/format_reward": 0.9575893431901932, + "step": 2772 + }, + { + "completion_length": 849.6339874267578, + "epoch": 0.8283175266970353, + "grad_norm": 1.5831831693649292, + "kl": 0.7685546875, + "learning_rate": 1.7826811112949058e-07, + "loss": -0.0132, + "reward": 1.1741071939468384, + "reward_std": 0.29334499686956406, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.948660746216774, + "step": 2773 + }, + { + "completion_length": 874.8616333007812, + "epoch": 0.8286162347845568, + "grad_norm": 1.4609225988388062, + "kl": 0.7021484375, + "learning_rate": 1.7800379710400213e-07, + "loss": -0.017, + "reward": 1.1651786267757416, + "reward_std": 0.32398808747529984, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.9486607611179352, + "step": 2774 + }, + { + "completion_length": 834.6562957763672, + "epoch": 0.8289149428720782, + "grad_norm": 1.19111967086792, + "kl": 0.748046875, + "learning_rate": 1.77739887773923e-07, + "loss": 0.0236, + "reward": 1.1875000596046448, + "reward_std": 0.27841050550341606, + "rewards/accuracy_reward": 0.22544643841683865, + "rewards/format_reward": 0.9620535969734192, + "step": 2775 + }, + { + "completion_length": 836.7879791259766, + "epoch": 0.8292136509595998, + "grad_norm": 1.336730718612671, + "kl": 0.779296875, + "learning_rate": 1.7747638342636042e-07, + "loss": 0.0182, + "reward": 1.207589328289032, + "reward_std": 0.22432401403784752, + "rewards/accuracy_reward": 0.2522321492433548, + "rewards/format_reward": 0.9553571790456772, + "step": 2776 + }, + { + "completion_length": 924.9107360839844, + "epoch": 0.8295123590471212, + "grad_norm": 2.2149641513824463, + "kl": 0.84375, + "learning_rate": 1.772132843479816e-07, + "loss": 0.0739, + "reward": 1.066964328289032, + "reward_std": 0.2514338009059429, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.95089291036129, + "step": 2777 + }, + { + "completion_length": 818.3750305175781, + "epoch": 0.8298110671346427, + "grad_norm": 1.4609544277191162, + "kl": 0.9404296875, + "learning_rate": 1.7695059082501224e-07, + "loss": 0.0689, + "reward": 1.2165179252624512, + "reward_std": 0.26861409470438957, + "rewards/accuracy_reward": 0.2633928656578064, + "rewards/format_reward": 0.9531250298023224, + "step": 2778 + }, + { + "completion_length": 859.8705749511719, + "epoch": 0.8301097752221641, + "grad_norm": 1.3215135335922241, + "kl": 1.0693359375, + "learning_rate": 1.7668830314323726e-07, + "loss": 0.0251, + "reward": 1.073660746216774, + "reward_std": 0.2997182346880436, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.9531250447034836, + "step": 2779 + }, + { + "completion_length": 901.9866333007812, + "epoch": 0.8304084833096856, + "grad_norm": 1.334228515625, + "kl": 0.591796875, + "learning_rate": 1.7642642158800015e-07, + "loss": -0.0105, + "reward": 1.064732164144516, + "reward_std": 0.222100168466568, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.9665178954601288, + "step": 2780 + }, + { + "completion_length": 857.4196929931641, + "epoch": 0.8307071913972071, + "grad_norm": 1.0379297733306885, + "kl": 0.69189453125, + "learning_rate": 1.7616494644420208e-07, + "loss": -0.0022, + "reward": 1.147321492433548, + "reward_std": 0.24881012365221977, + "rewards/accuracy_reward": 0.18303571827709675, + "rewards/format_reward": 0.964285746216774, + "step": 2781 + }, + { + "completion_length": 980.7433471679688, + "epoch": 0.8310058994847286, + "grad_norm": 1.83580482006073, + "kl": 0.986328125, + "learning_rate": 1.7590387799630246e-07, + "loss": 0.0299, + "reward": 1.0937500596046448, + "reward_std": 0.26190174743533134, + "rewards/accuracy_reward": 0.14285714644938707, + "rewards/format_reward": 0.95089291036129, + "step": 2782 + }, + { + "completion_length": 872.3951263427734, + "epoch": 0.83130460757225, + "grad_norm": 1.5170706510543823, + "kl": 0.6787109375, + "learning_rate": 1.7564321652831827e-07, + "loss": 0.0312, + "reward": 1.1450893580913544, + "reward_std": 0.1744727324694395, + "rewards/accuracy_reward": 0.16964286309666932, + "rewards/format_reward": 0.9754464775323868, + "step": 2783 + }, + { + "completion_length": 930.2098541259766, + "epoch": 0.8316033156597715, + "grad_norm": 0.9469956755638123, + "kl": 0.6494140625, + "learning_rate": 1.7538296232382355e-07, + "loss": 0.0226, + "reward": 1.035714328289032, + "reward_std": 0.1747051514685154, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.9732143133878708, + "step": 2784 + }, + { + "completion_length": 857.7924499511719, + "epoch": 0.8319020237472929, + "grad_norm": 1.2322325706481934, + "kl": 0.8583984375, + "learning_rate": 1.7512311566594955e-07, + "loss": 0.0632, + "reward": 1.1674107611179352, + "reward_std": 0.2169646881520748, + "rewards/accuracy_reward": 0.20089287124574184, + "rewards/format_reward": 0.9665178954601288, + "step": 2785 + }, + { + "completion_length": 810.5513763427734, + "epoch": 0.8322007318348145, + "grad_norm": 1.7663246393203735, + "kl": 0.87060546875, + "learning_rate": 1.7486367683738375e-07, + "loss": 0.0351, + "reward": 1.1361607611179352, + "reward_std": 0.25060633569955826, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.9464285969734192, + "step": 2786 + }, + { + "completion_length": 983.2500305175781, + "epoch": 0.8324994399223359, + "grad_norm": 1.6166571378707886, + "kl": 0.9716796875, + "learning_rate": 1.746046461203701e-07, + "loss": 0.082, + "reward": 1.1361607611179352, + "reward_std": 0.26455773785710335, + "rewards/accuracy_reward": 0.18526786752045155, + "rewards/format_reward": 0.9508928805589676, + "step": 2787 + }, + { + "completion_length": 851.7991485595703, + "epoch": 0.8327981480098574, + "grad_norm": 1.1526228189468384, + "kl": 0.64892578125, + "learning_rate": 1.74346023796709e-07, + "loss": 0.0217, + "reward": 1.0714286267757416, + "reward_std": 0.20778468810021877, + "rewards/accuracy_reward": 0.11160715017467737, + "rewards/format_reward": 0.9598214775323868, + "step": 2788 + }, + { + "completion_length": 938.2678985595703, + "epoch": 0.8330968560973788, + "grad_norm": 1.3627876043319702, + "kl": 0.86328125, + "learning_rate": 1.740878101477558e-07, + "loss": 0.0328, + "reward": 1.1138393580913544, + "reward_std": 0.25762293860316277, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9553571790456772, + "step": 2789 + }, + { + "completion_length": 780.1027069091797, + "epoch": 0.8333955641849004, + "grad_norm": 1.1647509336471558, + "kl": 0.6826171875, + "learning_rate": 1.7383000545442188e-07, + "loss": 0.0755, + "reward": 1.1562500596046448, + "reward_std": 0.2126958342269063, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.9732143431901932, + "step": 2790 + }, + { + "completion_length": 904.4308319091797, + "epoch": 0.8336942722724218, + "grad_norm": 1.0726505517959595, + "kl": 0.79052734375, + "learning_rate": 1.7357260999717343e-07, + "loss": 0.0255, + "reward": 1.131696492433548, + "reward_std": 0.23280581831932068, + "rewards/accuracy_reward": 0.1830357164144516, + "rewards/format_reward": 0.9486607611179352, + "step": 2791 + }, + { + "completion_length": 960.0245819091797, + "epoch": 0.8339929803599433, + "grad_norm": 1.1851762533187866, + "kl": 0.65576171875, + "learning_rate": 1.733156240560314e-07, + "loss": 0.037, + "reward": 1.098214328289032, + "reward_std": 0.21967268362641335, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9665178954601288, + "step": 2792 + }, + { + "completion_length": 915.2098693847656, + "epoch": 0.8342916884474647, + "grad_norm": 1.824832558631897, + "kl": 1.0751953125, + "learning_rate": 1.7305904791057135e-07, + "loss": 0.0862, + "reward": 1.0736607611179352, + "reward_std": 0.19998471066355705, + "rewards/accuracy_reward": 0.10714286286383867, + "rewards/format_reward": 0.96651791036129, + "step": 2793 + }, + { + "completion_length": 880.966552734375, + "epoch": 0.8345903965349862, + "grad_norm": 1.535662293434143, + "kl": 0.8671875, + "learning_rate": 1.7280288183992307e-07, + "loss": 0.0259, + "reward": 1.1540179252624512, + "reward_std": 0.22460211254656315, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.964285746216774, + "step": 2794 + }, + { + "completion_length": 858.9241485595703, + "epoch": 0.8348891046225076, + "grad_norm": 1.2821969985961914, + "kl": 0.693359375, + "learning_rate": 1.7254712612276998e-07, + "loss": 0.0671, + "reward": 1.1674107611179352, + "reward_std": 0.2358027584850788, + "rewards/accuracy_reward": 0.1919642984867096, + "rewards/format_reward": 0.9754464775323868, + "step": 2795 + }, + { + "completion_length": 960.8928833007812, + "epoch": 0.8351878127100292, + "grad_norm": 1.3848309516906738, + "kl": 0.79296875, + "learning_rate": 1.7229178103734943e-07, + "loss": -0.0028, + "reward": 1.10714291036129, + "reward_std": 0.2088138461112976, + "rewards/accuracy_reward": 0.15848215040750802, + "rewards/format_reward": 0.9486607611179352, + "step": 2796 + }, + { + "completion_length": 848.3281707763672, + "epoch": 0.8354865207975506, + "grad_norm": 1.6382941007614136, + "kl": 0.84375, + "learning_rate": 1.7203684686145156e-07, + "loss": 0.0202, + "reward": 1.1294643580913544, + "reward_std": 0.24006951972842216, + "rewards/accuracy_reward": 0.16294643376022577, + "rewards/format_reward": 0.9665178954601288, + "step": 2797 + }, + { + "completion_length": 843.0045013427734, + "epoch": 0.8357852288850721, + "grad_norm": 1.5295467376708984, + "kl": 0.5146484375, + "learning_rate": 1.7178232387241998e-07, + "loss": 0.0439, + "reward": 1.2120536267757416, + "reward_std": 0.2793840765953064, + "rewards/accuracy_reward": 0.2477678656578064, + "rewards/format_reward": 0.9642857760190964, + "step": 2798 + }, + { + "completion_length": 882.7433471679688, + "epoch": 0.8360839369725935, + "grad_norm": 0.9080928564071655, + "kl": 0.53564453125, + "learning_rate": 1.715282123471508e-07, + "loss": 0.0181, + "reward": 1.2678571939468384, + "reward_std": 0.2477221041917801, + "rewards/accuracy_reward": 0.3080357350409031, + "rewards/format_reward": 0.9598214626312256, + "step": 2799 + }, + { + "completion_length": 890.9911041259766, + "epoch": 0.836382645060115, + "grad_norm": 1.9749078750610352, + "kl": 0.44873046875, + "learning_rate": 1.7127451256209226e-07, + "loss": 0.0364, + "reward": 1.1339285969734192, + "reward_std": 0.2426307536661625, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9709821939468384, + "step": 2800 + }, + { + "completion_length": 867.4464721679688, + "epoch": 0.8366813531476365, + "grad_norm": 0.9842427968978882, + "kl": 0.79931640625, + "learning_rate": 1.7102122479324495e-07, + "loss": 0.0261, + "reward": 1.3638393580913544, + "reward_std": 0.24757397174835205, + "rewards/accuracy_reward": 0.3906250149011612, + "rewards/format_reward": 0.973214328289032, + "step": 2801 + }, + { + "completion_length": 896.2187957763672, + "epoch": 0.8369800612351579, + "grad_norm": 0.7378154397010803, + "kl": 0.5556640625, + "learning_rate": 1.707683493161613e-07, + "loss": 0.0592, + "reward": 1.1875000596046448, + "reward_std": 0.21225174516439438, + "rewards/accuracy_reward": 0.227678582072258, + "rewards/format_reward": 0.9598214775323868, + "step": 2802 + }, + { + "completion_length": 918.4129943847656, + "epoch": 0.8372787693226794, + "grad_norm": 1.576168179512024, + "kl": 0.619140625, + "learning_rate": 1.7051588640594477e-07, + "loss": 0.0059, + "reward": 1.1227679252624512, + "reward_std": 0.1852349229156971, + "rewards/accuracy_reward": 0.15178572107106447, + "rewards/format_reward": 0.9709821790456772, + "step": 2803 + }, + { + "completion_length": 873.8326263427734, + "epoch": 0.8375774774102008, + "grad_norm": 2.4413554668426514, + "kl": 0.4775390625, + "learning_rate": 1.7026383633725039e-07, + "loss": 0.0528, + "reward": 1.116071492433548, + "reward_std": 0.22735081985592842, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9687500447034836, + "step": 2804 + }, + { + "completion_length": 868.8928833007812, + "epoch": 0.8378761854977224, + "grad_norm": 1.4330244064331055, + "kl": 0.6513671875, + "learning_rate": 1.70012199384284e-07, + "loss": 0.029, + "reward": 1.1964285969734192, + "reward_std": 0.18493074923753738, + "rewards/accuracy_reward": 0.22098215483129025, + "rewards/format_reward": 0.9754464626312256, + "step": 2805 + }, + { + "completion_length": 880.9152069091797, + "epoch": 0.8381748935852438, + "grad_norm": 1.186423420906067, + "kl": 0.68310546875, + "learning_rate": 1.6976097582080184e-07, + "loss": 0.024, + "reward": 1.2209821939468384, + "reward_std": 0.27694046869874, + "rewards/accuracy_reward": 0.2611607313156128, + "rewards/format_reward": 0.9598214775323868, + "step": 2806 + }, + { + "completion_length": 787.1518249511719, + "epoch": 0.8384736016727653, + "grad_norm": 1.3134387731552124, + "kl": 0.5107421875, + "learning_rate": 1.6951016592011053e-07, + "loss": 0.0358, + "reward": 1.1540178954601288, + "reward_std": 0.19451243802905083, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9687500447034836, + "step": 2807 + }, + { + "completion_length": 786.9397735595703, + "epoch": 0.8387723097602867, + "grad_norm": 0.7974718809127808, + "kl": 0.6865234375, + "learning_rate": 1.6925976995506674e-07, + "loss": 0.0577, + "reward": 1.1339286267757416, + "reward_std": 0.22482243180274963, + "rewards/accuracy_reward": 0.16741072572767735, + "rewards/format_reward": 0.96651791036129, + "step": 2808 + }, + { + "completion_length": 911.1964721679688, + "epoch": 0.8390710178478082, + "grad_norm": 2.995476722717285, + "kl": 0.7080078125, + "learning_rate": 1.6900978819807664e-07, + "loss": 0.0734, + "reward": 1.145089328289032, + "reward_std": 0.23667947202920914, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.9709821790456772, + "step": 2809 + }, + { + "completion_length": 873.1964569091797, + "epoch": 0.8393697259353297, + "grad_norm": 2.082545757293701, + "kl": 0.837890625, + "learning_rate": 1.6876022092109604e-07, + "loss": 0.0822, + "reward": 1.162946492433548, + "reward_std": 0.18062766641378403, + "rewards/accuracy_reward": 0.19419644214212894, + "rewards/format_reward": 0.9687500447034836, + "step": 2810 + }, + { + "completion_length": 946.5022735595703, + "epoch": 0.8396684340228512, + "grad_norm": 0.9605470895767212, + "kl": 0.57080078125, + "learning_rate": 1.685110683956294e-07, + "loss": 0.0026, + "reward": 1.1540179252624512, + "reward_std": 0.24527663737535477, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.973214328289032, + "step": 2811 + }, + { + "completion_length": 837.919677734375, + "epoch": 0.8399671421103726, + "grad_norm": 1.8391764163970947, + "kl": 0.7578125, + "learning_rate": 1.6826233089273046e-07, + "loss": 0.0233, + "reward": 1.1517857313156128, + "reward_std": 0.1829164894297719, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9687500447034836, + "step": 2812 + }, + { + "completion_length": 874.6674499511719, + "epoch": 0.8402658501978941, + "grad_norm": 1.9821882247924805, + "kl": 0.78125, + "learning_rate": 1.680140086830013e-07, + "loss": 0.0515, + "reward": 1.0424107611179352, + "reward_std": 0.21011393517255783, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.9531250447034836, + "step": 2813 + }, + { + "completion_length": 937.3638763427734, + "epoch": 0.8405645582854155, + "grad_norm": 1.6540412902832031, + "kl": 0.73095703125, + "learning_rate": 1.6776610203659192e-07, + "loss": 0.0525, + "reward": 1.1071428805589676, + "reward_std": 0.188679501414299, + "rewards/accuracy_reward": 0.1450892873108387, + "rewards/format_reward": 0.9620536118745804, + "step": 2814 + }, + { + "completion_length": 879.5692291259766, + "epoch": 0.8408632663729371, + "grad_norm": 1.8688383102416992, + "kl": 0.84765625, + "learning_rate": 1.675186112232006e-07, + "loss": 0.1044, + "reward": 1.035714328289032, + "reward_std": 0.21766164153814316, + "rewards/accuracy_reward": 0.07589285913854837, + "rewards/format_reward": 0.9598214775323868, + "step": 2815 + }, + { + "completion_length": 837.1295166015625, + "epoch": 0.8411619744604585, + "grad_norm": 2.953019618988037, + "kl": 2.1787109375, + "learning_rate": 1.6727153651207313e-07, + "loss": 0.0144, + "reward": 1.223214328289032, + "reward_std": 0.25219283252954483, + "rewards/accuracy_reward": 0.2544642947614193, + "rewards/format_reward": 0.9687500447034836, + "step": 2816 + }, + { + "completion_length": 907.4866333007812, + "epoch": 0.84146068254798, + "grad_norm": 1.2005929946899414, + "kl": 0.6767578125, + "learning_rate": 1.6702487817200238e-07, + "loss": -0.0198, + "reward": 1.1607143580913544, + "reward_std": 0.22549035772681236, + "rewards/accuracy_reward": 0.1964285857975483, + "rewards/format_reward": 0.964285746216774, + "step": 2817 + }, + { + "completion_length": 844.3326263427734, + "epoch": 0.8417593906355014, + "grad_norm": 1.3630425930023193, + "kl": 0.6484375, + "learning_rate": 1.6677863647132867e-07, + "loss": 0.0589, + "reward": 1.104910746216774, + "reward_std": 0.24911786802113056, + "rewards/accuracy_reward": 0.14732143632136285, + "rewards/format_reward": 0.957589328289032, + "step": 2818 + }, + { + "completion_length": 903.8772735595703, + "epoch": 0.842058098723023, + "grad_norm": 1.5796706676483154, + "kl": 0.96240234375, + "learning_rate": 1.665328116779388e-07, + "loss": 0.0559, + "reward": 1.095982164144516, + "reward_std": 0.22428016364574432, + "rewards/accuracy_reward": 0.14062500977888703, + "rewards/format_reward": 0.9553571939468384, + "step": 2819 + }, + { + "completion_length": 960.8370971679688, + "epoch": 0.8423568068105444, + "grad_norm": 1.0426994562149048, + "kl": 0.54736328125, + "learning_rate": 1.6628740405926594e-07, + "loss": 0.0185, + "reward": 1.084821492433548, + "reward_std": 0.17315761744976044, + "rewards/accuracy_reward": 0.11830357508733869, + "rewards/format_reward": 0.96651791036129, + "step": 2820 + }, + { + "completion_length": 962.5647888183594, + "epoch": 0.8426555148980659, + "grad_norm": 1.1408697366714478, + "kl": 0.8310546875, + "learning_rate": 1.6604241388228954e-07, + "loss": 0.0719, + "reward": 1.1741072088479996, + "reward_std": 0.2852811627089977, + "rewards/accuracy_reward": 0.22321430314332247, + "rewards/format_reward": 0.9508928954601288, + "step": 2821 + }, + { + "completion_length": 900.9598541259766, + "epoch": 0.8429542229855873, + "grad_norm": 1.881281852722168, + "kl": 0.6611328125, + "learning_rate": 1.6579784141353508e-07, + "loss": 0.051, + "reward": 1.1718750596046448, + "reward_std": 0.2568732015788555, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9709821790456772, + "step": 2822 + }, + { + "completion_length": 770.5848541259766, + "epoch": 0.8432529310731088, + "grad_norm": 1.1719833612442017, + "kl": 0.75390625, + "learning_rate": 1.6555368691907318e-07, + "loss": 0.0329, + "reward": 1.2589286267757416, + "reward_std": 0.26236409321427345, + "rewards/accuracy_reward": 0.2946428768336773, + "rewards/format_reward": 0.9642857760190964, + "step": 2823 + }, + { + "completion_length": 804.9174346923828, + "epoch": 0.8435516391606303, + "grad_norm": 1.4817566871643066, + "kl": 0.7900390625, + "learning_rate": 1.6530995066452024e-07, + "loss": 0.0279, + "reward": 1.1361607611179352, + "reward_std": 0.19750957190990448, + "rewards/accuracy_reward": 0.17410714784637094, + "rewards/format_reward": 0.9620536118745804, + "step": 2824 + }, + { + "completion_length": 798.8839569091797, + "epoch": 0.8438503472481518, + "grad_norm": 1.4412972927093506, + "kl": 0.828125, + "learning_rate": 1.650666329150372e-07, + "loss": 0.0228, + "reward": 1.2120536267757416, + "reward_std": 0.19212611205875874, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9754464775323868, + "step": 2825 + }, + { + "completion_length": 830.638427734375, + "epoch": 0.8441490553356732, + "grad_norm": 1.7292615175247192, + "kl": 0.66845703125, + "learning_rate": 1.6482373393533e-07, + "loss": 0.0404, + "reward": 1.0892857760190964, + "reward_std": 0.17759856022894382, + "rewards/accuracy_reward": 0.12276785937137902, + "rewards/format_reward": 0.9665178954601288, + "step": 2826 + }, + { + "completion_length": 890.2053985595703, + "epoch": 0.8444477634231947, + "grad_norm": 1.0571588277816772, + "kl": 0.6123046875, + "learning_rate": 1.6458125398964908e-07, + "loss": 0.0074, + "reward": 1.305803656578064, + "reward_std": 0.3028664253652096, + "rewards/accuracy_reward": 0.3392857201397419, + "rewards/format_reward": 0.96651791036129, + "step": 2827 + }, + { + "completion_length": 870.263427734375, + "epoch": 0.8447464715107161, + "grad_norm": 1.612282633781433, + "kl": 0.720703125, + "learning_rate": 1.643391933417886e-07, + "loss": 0.024, + "reward": 1.104910746216774, + "reward_std": 0.19097217544913292, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.9598214626312256, + "step": 2828 + }, + { + "completion_length": 962.9018402099609, + "epoch": 0.8450451795982377, + "grad_norm": 0.9006662964820862, + "kl": 0.650390625, + "learning_rate": 1.6409755225508697e-07, + "loss": 0.0196, + "reward": 1.069196492433548, + "reward_std": 0.1759709119796753, + "rewards/accuracy_reward": 0.10267857881262898, + "rewards/format_reward": 0.9665178805589676, + "step": 2829 + }, + { + "completion_length": 833.2768096923828, + "epoch": 0.8453438876857591, + "grad_norm": 1.6334824562072754, + "kl": 0.8525390625, + "learning_rate": 1.63856330992426e-07, + "loss": 0.0287, + "reward": 1.205357164144516, + "reward_std": 0.2181013748049736, + "rewards/accuracy_reward": 0.22991071827709675, + "rewards/format_reward": 0.9754464775323868, + "step": 2830 + }, + { + "completion_length": 947.6830902099609, + "epoch": 0.8456425957732806, + "grad_norm": 1.3998394012451172, + "kl": 0.8564453125, + "learning_rate": 1.636155298162308e-07, + "loss": 0.0589, + "reward": 1.0312500447034836, + "reward_std": 0.19333508238196373, + "rewards/accuracy_reward": 0.07142857694998384, + "rewards/format_reward": 0.9598214626312256, + "step": 2831 + }, + { + "completion_length": 867.1250305175781, + "epoch": 0.845941303860802, + "grad_norm": 1.9562195539474487, + "kl": 0.873046875, + "learning_rate": 1.6337514898846932e-07, + "loss": 0.0228, + "reward": 1.160714328289032, + "reward_std": 0.2771221064031124, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9441964626312256, + "step": 2832 + }, + { + "completion_length": 797.4687805175781, + "epoch": 0.8462400119483235, + "grad_norm": 1.190738558769226, + "kl": 0.75732421875, + "learning_rate": 1.6313518877065255e-07, + "loss": -0.0036, + "reward": 1.1674107611179352, + "reward_std": 0.2375970184803009, + "rewards/accuracy_reward": 0.21205358067527413, + "rewards/format_reward": 0.9553571939468384, + "step": 2833 + }, + { + "completion_length": 871.9553985595703, + "epoch": 0.846538720035845, + "grad_norm": 2.174420118331909, + "kl": 0.65283203125, + "learning_rate": 1.628956494238335e-07, + "loss": 0.0135, + "reward": 1.0736607760190964, + "reward_std": 0.27456385269761086, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.9531250447034836, + "step": 2834 + }, + { + "completion_length": 937.1585235595703, + "epoch": 0.8468374281233665, + "grad_norm": 1.6075692176818848, + "kl": 0.63671875, + "learning_rate": 1.626565312086075e-07, + "loss": 0.0044, + "reward": 1.1004464626312256, + "reward_std": 0.23674758151173592, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9486607611179352, + "step": 2835 + }, + { + "completion_length": 902.8683471679688, + "epoch": 0.8471361362108879, + "grad_norm": 0.9623729586601257, + "kl": 0.65234375, + "learning_rate": 1.6241783438511197e-07, + "loss": 0.0013, + "reward": 1.0111607611179352, + "reward_std": 0.2148426929488778, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.9464286118745804, + "step": 2836 + }, + { + "completion_length": 871.966552734375, + "epoch": 0.8474348442984094, + "grad_norm": 2.6926653385162354, + "kl": 0.861328125, + "learning_rate": 1.6217955921302537e-07, + "loss": 0.0448, + "reward": 1.0915178954601288, + "reward_std": 0.27204861119389534, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9352678954601288, + "step": 2837 + }, + { + "completion_length": 946.3013916015625, + "epoch": 0.8477335523859308, + "grad_norm": 1.2888143062591553, + "kl": 0.6181640625, + "learning_rate": 1.6194170595156798e-07, + "loss": 0.0274, + "reward": 1.236607164144516, + "reward_std": 0.23787900432944298, + "rewards/accuracy_reward": 0.27008930407464504, + "rewards/format_reward": 0.9665178954601288, + "step": 2838 + }, + { + "completion_length": 916.5915679931641, + "epoch": 0.8480322604734524, + "grad_norm": 3.0913498401641846, + "kl": 0.6201171875, + "learning_rate": 1.6170427485950055e-07, + "loss": 0.0286, + "reward": 1.2254464626312256, + "reward_std": 0.2714087516069412, + "rewards/accuracy_reward": 0.2611607275903225, + "rewards/format_reward": 0.9642857611179352, + "step": 2839 + }, + { + "completion_length": 926.0692443847656, + "epoch": 0.8483309685609738, + "grad_norm": 2.1229748725891113, + "kl": 0.681640625, + "learning_rate": 1.6146726619512504e-07, + "loss": 0.0437, + "reward": 1.0825893431901932, + "reward_std": 0.1817437708377838, + "rewards/accuracy_reward": 0.10937500512227416, + "rewards/format_reward": 0.973214328289032, + "step": 2840 + }, + { + "completion_length": 906.0402221679688, + "epoch": 0.8486296766484953, + "grad_norm": 0.6168565154075623, + "kl": 0.71044921875, + "learning_rate": 1.6123068021628375e-07, + "loss": 0.0325, + "reward": 1.194196492433548, + "reward_std": 0.22199420630931854, + "rewards/accuracy_reward": 0.223214291036129, + "rewards/format_reward": 0.9709821939468384, + "step": 2841 + }, + { + "completion_length": 896.0491485595703, + "epoch": 0.8489283847360167, + "grad_norm": 1.1383750438690186, + "kl": 0.8525390625, + "learning_rate": 1.6099451718035874e-07, + "loss": 0.0568, + "reward": 1.127232164144516, + "reward_std": 0.18122976645827293, + "rewards/accuracy_reward": 0.16517857741564512, + "rewards/format_reward": 0.9620536118745804, + "step": 2842 + }, + { + "completion_length": 919.0156860351562, + "epoch": 0.8492270928235381, + "grad_norm": 2.6795246601104736, + "kl": 0.484375, + "learning_rate": 1.6075877734427247e-07, + "loss": 0.0262, + "reward": 1.1964286267757416, + "reward_std": 0.22595332190394402, + "rewards/accuracy_reward": 0.2410714328289032, + "rewards/format_reward": 0.9553571790456772, + "step": 2843 + }, + { + "completion_length": 879.0089569091797, + "epoch": 0.8495258009110597, + "grad_norm": 1.6831647157669067, + "kl": 0.560546875, + "learning_rate": 1.6052346096448673e-07, + "loss": 0.0521, + "reward": 1.2098215222358704, + "reward_std": 0.23253164626657963, + "rewards/accuracy_reward": 0.2343750074505806, + "rewards/format_reward": 0.975446492433548, + "step": 2844 + }, + { + "completion_length": 898.3348693847656, + "epoch": 0.8498245089985811, + "grad_norm": 1.1040444374084473, + "kl": 0.640625, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0079, + "reward": 1.1049107760190964, + "reward_std": 0.23356518149375916, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9508928805589676, + "step": 2845 + }, + { + "completion_length": 912.2634429931641, + "epoch": 0.8501232170861026, + "grad_norm": 0.6280046701431274, + "kl": 0.6533203125, + "learning_rate": 1.6005409959736035e-07, + "loss": 0.0069, + "reward": 1.0625000298023224, + "reward_std": 0.2230011522769928, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.964285746216774, + "step": 2846 + }, + { + "completion_length": 833.1518249511719, + "epoch": 0.850421925173624, + "grad_norm": 1.6156113147735596, + "kl": 0.7060546875, + "learning_rate": 1.59820055120639e-07, + "loss": 0.0286, + "reward": 1.1875000596046448, + "reward_std": 0.3156270682811737, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9553571790456772, + "step": 2847 + }, + { + "completion_length": 902.1897583007812, + "epoch": 0.8507206332611456, + "grad_norm": 7.277552604675293, + "kl": 0.9072265625, + "learning_rate": 1.5958643512145584e-07, + "loss": 0.0641, + "reward": 1.160714328289032, + "reward_std": 0.2015850469470024, + "rewards/accuracy_reward": 0.19419643376022577, + "rewards/format_reward": 0.9665178954601288, + "step": 2848 + }, + { + "completion_length": 833.8616333007812, + "epoch": 0.851019341348667, + "grad_norm": 1.2040578126907349, + "kl": 0.54296875, + "learning_rate": 1.5935323985396674e-07, + "loss": -0.0001, + "reward": 1.131696492433548, + "reward_std": 0.2249947190284729, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9665178805589676, + "step": 2849 + }, + { + "completion_length": 907.5335235595703, + "epoch": 0.8513180494361885, + "grad_norm": 1.2062221765518188, + "kl": 0.54296875, + "learning_rate": 1.5912046957186507e-07, + "loss": -0.0027, + "reward": 1.209821492433548, + "reward_std": 0.24773672595620155, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.964285746216774, + "step": 2850 + }, + { + "completion_length": 863.4330749511719, + "epoch": 0.8516167575237099, + "grad_norm": 1.5925003290176392, + "kl": 0.9296875, + "learning_rate": 1.588881245283822e-07, + "loss": 0.0323, + "reward": 1.2075893580913544, + "reward_std": 0.25798854418098927, + "rewards/accuracy_reward": 0.2455357238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 2851 + }, + { + "completion_length": 821.4955749511719, + "epoch": 0.8519154656112314, + "grad_norm": 1.8281793594360352, + "kl": 0.67333984375, + "learning_rate": 1.5865620497628683e-07, + "loss": 0.017, + "reward": 1.145089328289032, + "reward_std": 0.21834901347756386, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9642857611179352, + "step": 2852 + }, + { + "completion_length": 960.5625457763672, + "epoch": 0.8522141736987529, + "grad_norm": 1.113348126411438, + "kl": 0.673828125, + "learning_rate": 1.5842471116788458e-07, + "loss": 0.0132, + "reward": 1.1495536267757416, + "reward_std": 0.2329932637512684, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9464286118745804, + "step": 2853 + }, + { + "completion_length": 858.8348693847656, + "epoch": 0.8525128817862744, + "grad_norm": 1.0168771743774414, + "kl": 0.646484375, + "learning_rate": 1.5819364335501805e-07, + "loss": 0.0341, + "reward": 1.1629464328289032, + "reward_std": 0.27933669835329056, + "rewards/accuracy_reward": 0.1919642984867096, + "rewards/format_reward": 0.9709821790456772, + "step": 2854 + }, + { + "completion_length": 898.4643249511719, + "epoch": 0.8528115898737958, + "grad_norm": 1.0607346296310425, + "kl": 0.716796875, + "learning_rate": 1.579630017890665e-07, + "loss": 0.0524, + "reward": 1.1227678954601288, + "reward_std": 0.2251165397465229, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 2855 + }, + { + "completion_length": 944.3303833007812, + "epoch": 0.8531102979613173, + "grad_norm": 1.3789225816726685, + "kl": 0.7734375, + "learning_rate": 1.5773278672094515e-07, + "loss": 0.0717, + "reward": 1.165178656578064, + "reward_std": 0.21693093329668045, + "rewards/accuracy_reward": 0.20758929289877415, + "rewards/format_reward": 0.9575893133878708, + "step": 2856 + }, + { + "completion_length": 934.638427734375, + "epoch": 0.8534090060488387, + "grad_norm": 1.110405445098877, + "kl": 0.63671875, + "learning_rate": 1.5750299840110554e-07, + "loss": 0.0071, + "reward": 1.1183035969734192, + "reward_std": 0.26663728430867195, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9553571790456772, + "step": 2857 + }, + { + "completion_length": 908.3326263427734, + "epoch": 0.8537077141363603, + "grad_norm": 1.491316795349121, + "kl": 0.52587890625, + "learning_rate": 1.5727363707953495e-07, + "loss": 0.0059, + "reward": 1.0781250596046448, + "reward_std": 0.20155739784240723, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.9687500298023224, + "step": 2858 + }, + { + "completion_length": 863.1049346923828, + "epoch": 0.8540064222238817, + "grad_norm": 1.3200150728225708, + "kl": 0.6689453125, + "learning_rate": 1.5704470300575572e-07, + "loss": 0.0514, + "reward": 1.1026786267757416, + "reward_std": 0.24201859161257744, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.9464286118745804, + "step": 2859 + }, + { + "completion_length": 923.857177734375, + "epoch": 0.8543051303114032, + "grad_norm": 1.0041393041610718, + "kl": 0.701171875, + "learning_rate": 1.5681619642882593e-07, + "loss": 0.0549, + "reward": 1.0848214775323868, + "reward_std": 0.28157924860715866, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.9441964775323868, + "step": 2860 + }, + { + "completion_length": 847.1763763427734, + "epoch": 0.8546038383989246, + "grad_norm": 1.4904351234436035, + "kl": 0.78662109375, + "learning_rate": 1.5658811759733833e-07, + "loss": 0.0228, + "reward": 1.0959822088479996, + "reward_std": 0.15050717256963253, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9821428954601288, + "step": 2861 + }, + { + "completion_length": 844.6875457763672, + "epoch": 0.8549025464864461, + "grad_norm": 2.026512622833252, + "kl": 0.5869140625, + "learning_rate": 1.563604667594202e-07, + "loss": 0.0367, + "reward": 1.0848214626312256, + "reward_std": 0.20789780467748642, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9620536118745804, + "step": 2862 + }, + { + "completion_length": 923.1987152099609, + "epoch": 0.8552012545739676, + "grad_norm": 1.1224002838134766, + "kl": 0.8291015625, + "learning_rate": 1.5613324416273353e-07, + "loss": 0.0143, + "reward": 1.1071428954601288, + "reward_std": 0.2566589191555977, + "rewards/accuracy_reward": 0.1629464328289032, + "rewards/format_reward": 0.9441964626312256, + "step": 2863 + }, + { + "completion_length": 875.1138610839844, + "epoch": 0.8554999626614891, + "grad_norm": 1.214990258216858, + "kl": 0.52587890625, + "learning_rate": 1.5590645005447397e-07, + "loss": 0.0157, + "reward": 1.118303656578064, + "reward_std": 0.19220494851469994, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.970982164144516, + "step": 2864 + }, + { + "completion_length": 901.2835235595703, + "epoch": 0.8557986707490105, + "grad_norm": 1.5715945959091187, + "kl": 0.541015625, + "learning_rate": 1.5568008468137148e-07, + "loss": 0.0103, + "reward": 1.1361607909202576, + "reward_std": 0.22631775215268135, + "rewards/accuracy_reward": 0.16517858300358057, + "rewards/format_reward": 0.970982164144516, + "step": 2865 + }, + { + "completion_length": 931.4754791259766, + "epoch": 0.856097378836532, + "grad_norm": 1.8642688989639282, + "kl": 0.6552734375, + "learning_rate": 1.5545414828968944e-07, + "loss": 0.0441, + "reward": 1.0892857909202576, + "reward_std": 0.18106604740023613, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9754464775323868, + "step": 2866 + }, + { + "completion_length": 1010.685302734375, + "epoch": 0.8563960869240534, + "grad_norm": 0.7676990628242493, + "kl": 0.6796875, + "learning_rate": 1.5522864112522433e-07, + "loss": 0.0617, + "reward": 1.0491071939468384, + "reward_std": 0.2490660920739174, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.9441964626312256, + "step": 2867 + }, + { + "completion_length": 904.9308319091797, + "epoch": 0.856694795011575, + "grad_norm": 1.2784074544906616, + "kl": 0.662109375, + "learning_rate": 1.550035634333059e-07, + "loss": 0.0515, + "reward": 1.2209821939468384, + "reward_std": 0.2581138573586941, + "rewards/accuracy_reward": 0.2723214365541935, + "rewards/format_reward": 0.948660746216774, + "step": 2868 + }, + { + "completion_length": 940.6875305175781, + "epoch": 0.8569935030990964, + "grad_norm": 1.2268545627593994, + "kl": 0.45947265625, + "learning_rate": 1.5477891545879674e-07, + "loss": 0.0395, + "reward": 1.0424107611179352, + "reward_std": 0.17255445942282677, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.9620536118745804, + "step": 2869 + }, + { + "completion_length": 872.3460235595703, + "epoch": 0.8572922111866179, + "grad_norm": 1.656377911567688, + "kl": 0.818359375, + "learning_rate": 1.5455469744609163e-07, + "loss": 0.041, + "reward": 1.116071492433548, + "reward_std": 0.3794048950076103, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9375000447034836, + "step": 2870 + }, + { + "completion_length": 844.5736999511719, + "epoch": 0.8575909192741393, + "grad_norm": 1.009273886680603, + "kl": 0.892578125, + "learning_rate": 1.5433090963911788e-07, + "loss": 0.0936, + "reward": 1.0937500447034836, + "reward_std": 0.3149794451892376, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9330357611179352, + "step": 2871 + }, + { + "completion_length": 984.0067749023438, + "epoch": 0.8578896273616609, + "grad_norm": 0.8224257230758667, + "kl": 0.71728515625, + "learning_rate": 1.5410755228133483e-07, + "loss": 0.0366, + "reward": 1.1316964626312256, + "reward_std": 0.24905426800251007, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9441964775323868, + "step": 2872 + }, + { + "completion_length": 961.5781555175781, + "epoch": 0.8581883354491823, + "grad_norm": 1.6754963397979736, + "kl": 0.60888671875, + "learning_rate": 1.5388462561573315e-07, + "loss": 0.0104, + "reward": 1.1495536267757416, + "reward_std": 0.24062826484441757, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9620536118745804, + "step": 2873 + }, + { + "completion_length": 906.5380096435547, + "epoch": 0.8584870435367038, + "grad_norm": 1.0911351442337036, + "kl": 0.7109375, + "learning_rate": 1.5366212988483532e-07, + "loss": 0.0395, + "reward": 1.1339286416769028, + "reward_std": 0.2215336188673973, + "rewards/accuracy_reward": 0.18303572200238705, + "rewards/format_reward": 0.9508928805589676, + "step": 2874 + }, + { + "completion_length": 885.2388763427734, + "epoch": 0.8587857516242252, + "grad_norm": 0.9269694089889526, + "kl": 0.640625, + "learning_rate": 1.5344006533069503e-07, + "loss": 0.0256, + "reward": 1.082589328289032, + "reward_std": 0.21240580454468727, + "rewards/accuracy_reward": 0.13169643213041127, + "rewards/format_reward": 0.9508928954601288, + "step": 2875 + }, + { + "completion_length": 944.4643402099609, + "epoch": 0.8590844597117467, + "grad_norm": 1.5519870519638062, + "kl": 0.58203125, + "learning_rate": 1.5321843219489645e-07, + "loss": 0.0258, + "reward": 1.1049107611179352, + "reward_std": 0.2160738343372941, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9665178954601288, + "step": 2876 + }, + { + "completion_length": 966.8192291259766, + "epoch": 0.8593831677992682, + "grad_norm": 0.8274990320205688, + "kl": 0.5771484375, + "learning_rate": 1.5299723071855498e-07, + "loss": 0.0346, + "reward": 1.2611607611179352, + "reward_std": 0.2841428071260452, + "rewards/accuracy_reward": 0.3013392947614193, + "rewards/format_reward": 0.9598214775323868, + "step": 2877 + }, + { + "completion_length": 937.1495971679688, + "epoch": 0.8596818758867897, + "grad_norm": 0.9725874066352844, + "kl": 0.7490234375, + "learning_rate": 1.5277646114231596e-07, + "loss": 0.0473, + "reward": 1.1852679252624512, + "reward_std": 0.24077943339943886, + "rewards/accuracy_reward": 0.2299107201397419, + "rewards/format_reward": 0.9553571790456772, + "step": 2878 + }, + { + "completion_length": 843.2857513427734, + "epoch": 0.8599805839743111, + "grad_norm": 1.0436451435089111, + "kl": 0.4521484375, + "learning_rate": 1.5255612370635515e-07, + "loss": 0.0015, + "reward": 1.042410746216774, + "reward_std": 0.17085864394903183, + "rewards/accuracy_reward": 0.07142857648432255, + "rewards/format_reward": 0.9709821790456772, + "step": 2879 + }, + { + "completion_length": 853.0580749511719, + "epoch": 0.8602792920618326, + "grad_norm": 0.8424566984176636, + "kl": 0.703125, + "learning_rate": 1.523362186503781e-07, + "loss": 0.0296, + "reward": 1.1026786267757416, + "reward_std": 0.22341636009514332, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.9620536118745804, + "step": 2880 + }, + { + "completion_length": 843.5245819091797, + "epoch": 0.860578000149354, + "grad_norm": 1.6470218896865845, + "kl": 0.96923828125, + "learning_rate": 1.5211674621361985e-07, + "loss": 0.0378, + "reward": 1.0736607909202576, + "reward_std": 0.2985813021659851, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.941964328289032, + "step": 2881 + }, + { + "completion_length": 950.0312805175781, + "epoch": 0.8608767082368756, + "grad_norm": 1.0814902782440186, + "kl": 0.68359375, + "learning_rate": 1.51897706634845e-07, + "loss": 0.0506, + "reward": 1.0424107760190964, + "reward_std": 0.24431190639734268, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.9508928954601288, + "step": 2882 + }, + { + "completion_length": 833.3326263427734, + "epoch": 0.861175416324397, + "grad_norm": 1.036230206489563, + "kl": 0.453125, + "learning_rate": 1.5167910015234714e-07, + "loss": 0.0291, + "reward": 1.2656250596046448, + "reward_std": 0.21224823594093323, + "rewards/accuracy_reward": 0.2834821529686451, + "rewards/format_reward": 0.9821428954601288, + "step": 2883 + }, + { + "completion_length": 908.2991485595703, + "epoch": 0.8614741244119185, + "grad_norm": 1.8024553060531616, + "kl": 0.4921875, + "learning_rate": 1.5146092700394864e-07, + "loss": 0.0636, + "reward": 1.350446492433548, + "reward_std": 0.295924786478281, + "rewards/accuracy_reward": 0.3816964402794838, + "rewards/format_reward": 0.9687500447034836, + "step": 2884 + }, + { + "completion_length": 903.4040679931641, + "epoch": 0.8617728324994399, + "grad_norm": 1.5594496726989746, + "kl": 0.5966796875, + "learning_rate": 1.512431874270005e-07, + "loss": 0.0233, + "reward": 1.1919643580913544, + "reward_std": 0.2799229919910431, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.9508928954601288, + "step": 2885 + }, + { + "completion_length": 904.779052734375, + "epoch": 0.8620715405869613, + "grad_norm": 0.7702074646949768, + "kl": 0.453125, + "learning_rate": 1.510258816583822e-07, + "loss": -0.0098, + "reward": 1.0558036267757416, + "reward_std": 0.2209235206246376, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.9665178805589676, + "step": 2886 + }, + { + "completion_length": 888.0424499511719, + "epoch": 0.8623702486744829, + "grad_norm": 1.5732890367507935, + "kl": 0.7041015625, + "learning_rate": 1.5080900993450084e-07, + "loss": 0.0311, + "reward": 1.1160714626312256, + "reward_std": 0.24739359691739082, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.957589328289032, + "step": 2887 + }, + { + "completion_length": 812.8058471679688, + "epoch": 0.8626689567620043, + "grad_norm": 1.7595272064208984, + "kl": 0.85107421875, + "learning_rate": 1.5059257249129177e-07, + "loss": 0.023, + "reward": 1.089285746216774, + "reward_std": 0.19645293802022934, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.9441964626312256, + "step": 2888 + }, + { + "completion_length": 893.4687957763672, + "epoch": 0.8629676648495258, + "grad_norm": 1.3325146436691284, + "kl": 0.69921875, + "learning_rate": 1.503765695642178e-07, + "loss": 0.0128, + "reward": 1.0959821790456772, + "reward_std": 0.18219907395541668, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.9642857611179352, + "step": 2889 + }, + { + "completion_length": 932.9397888183594, + "epoch": 0.8632663729370472, + "grad_norm": 1.034636378288269, + "kl": 0.650390625, + "learning_rate": 1.5016100138826873e-07, + "loss": 0.0086, + "reward": 1.0758928954601288, + "reward_std": 0.16686387173831463, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.9754464775323868, + "step": 2890 + }, + { + "completion_length": 917.3125305175781, + "epoch": 0.8635650810245687, + "grad_norm": 0.6212211847305298, + "kl": 0.64013671875, + "learning_rate": 1.4994586819796185e-07, + "loss": 0.0176, + "reward": 1.2053571939468384, + "reward_std": 0.21188941970467567, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.973214328289032, + "step": 2891 + }, + { + "completion_length": 833.2991485595703, + "epoch": 0.8638637891120902, + "grad_norm": 2.209336757659912, + "kl": 0.68310546875, + "learning_rate": 1.497311702273407e-07, + "loss": 0.0269, + "reward": 1.1026786416769028, + "reward_std": 0.2643289491534233, + "rewards/accuracy_reward": 0.14285714784637094, + "rewards/format_reward": 0.9598214775323868, + "step": 2892 + }, + { + "completion_length": 870.372802734375, + "epoch": 0.8641624971996117, + "grad_norm": 1.408028244972229, + "kl": 0.62109375, + "learning_rate": 1.4951690770997582e-07, + "loss": 0.0435, + "reward": 1.1517857611179352, + "reward_std": 0.20261946693062782, + "rewards/accuracy_reward": 0.17410714644938707, + "rewards/format_reward": 0.9776786267757416, + "step": 2893 + }, + { + "completion_length": 944.0803985595703, + "epoch": 0.8644612052871331, + "grad_norm": 1.151188850402832, + "kl": 0.8466796875, + "learning_rate": 1.4930308087896386e-07, + "loss": 0.0537, + "reward": 1.0602678954601288, + "reward_std": 0.24783017113804817, + "rewards/accuracy_reward": 0.09821429196745157, + "rewards/format_reward": 0.9620536118745804, + "step": 2894 + }, + { + "completion_length": 853.2098693847656, + "epoch": 0.8647599133746546, + "grad_norm": 1.2096471786499023, + "kl": 0.6484375, + "learning_rate": 1.490896899669273e-07, + "loss": 0.0139, + "reward": 1.2276786267757416, + "reward_std": 0.24584606662392616, + "rewards/accuracy_reward": 0.2678571492433548, + "rewards/format_reward": 0.9598214775323868, + "step": 2895 + }, + { + "completion_length": 930.8036193847656, + "epoch": 0.865058621462176, + "grad_norm": 0.7572001218795776, + "kl": 0.68408203125, + "learning_rate": 1.4887673520601462e-07, + "loss": 0.0277, + "reward": 1.0870536267757416, + "reward_std": 0.23265226930379868, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.9598214626312256, + "step": 2896 + }, + { + "completion_length": 928.4107666015625, + "epoch": 0.8653573295496976, + "grad_norm": 1.8849115371704102, + "kl": 1.056640625, + "learning_rate": 1.4866421682789985e-07, + "loss": 0.0463, + "reward": 1.0892857611179352, + "reward_std": 0.18615643307566643, + "rewards/accuracy_reward": 0.13169643376022577, + "rewards/format_reward": 0.957589328289032, + "step": 2897 + }, + { + "completion_length": 918.9152221679688, + "epoch": 0.865656037637219, + "grad_norm": 1.1815077066421509, + "kl": 0.8740234375, + "learning_rate": 1.4845213506378192e-07, + "loss": 0.0358, + "reward": 1.07589291036129, + "reward_std": 0.15457626897841692, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.9620536118745804, + "step": 2898 + }, + { + "completion_length": 837.9509429931641, + "epoch": 0.8659547457247405, + "grad_norm": 1.848351240158081, + "kl": 0.7431640625, + "learning_rate": 1.4824049014438512e-07, + "loss": 0.0202, + "reward": 1.1718750596046448, + "reward_std": 0.18936587497591972, + "rewards/accuracy_reward": 0.19642858393490314, + "rewards/format_reward": 0.9754464626312256, + "step": 2899 + }, + { + "completion_length": 809.8348693847656, + "epoch": 0.8662534538122619, + "grad_norm": 1.8045644760131836, + "kl": 1.0537109375, + "learning_rate": 1.4802928229995845e-07, + "loss": 0.046, + "reward": 1.1450893431901932, + "reward_std": 0.2634972743690014, + "rewards/accuracy_reward": 0.18303571874275804, + "rewards/format_reward": 0.9620536118745804, + "step": 2900 + }, + { + "completion_length": 833.3482513427734, + "epoch": 0.8665521618997835, + "grad_norm": 1.7904542684555054, + "kl": 0.8994140625, + "learning_rate": 1.478185117602752e-07, + "loss": 0.0529, + "reward": 1.2142857611179352, + "reward_std": 0.20148493349552155, + "rewards/accuracy_reward": 0.2566964402794838, + "rewards/format_reward": 0.957589328289032, + "step": 2901 + }, + { + "completion_length": 916.4755096435547, + "epoch": 0.8668508699873049, + "grad_norm": 0.8544791340827942, + "kl": 0.9267578125, + "learning_rate": 1.4760817875463318e-07, + "loss": 0.0248, + "reward": 1.1272321939468384, + "reward_std": 0.3067249357700348, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.95089291036129, + "step": 2902 + }, + { + "completion_length": 856.7344207763672, + "epoch": 0.8671495780748264, + "grad_norm": 2.210017442703247, + "kl": 0.51025390625, + "learning_rate": 1.4739828351185407e-07, + "loss": 0.0035, + "reward": 1.1339285969734192, + "reward_std": 0.20241517201066017, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9821428954601288, + "step": 2903 + }, + { + "completion_length": 926.1451263427734, + "epoch": 0.8674482861623478, + "grad_norm": 1.4447946548461914, + "kl": 0.62451171875, + "learning_rate": 1.4718882626028323e-07, + "loss": 0.0212, + "reward": 1.1406250447034836, + "reward_std": 0.1976996473968029, + "rewards/accuracy_reward": 0.1808035857975483, + "rewards/format_reward": 0.9598214626312256, + "step": 2904 + }, + { + "completion_length": 930.950927734375, + "epoch": 0.8677469942498693, + "grad_norm": 1.395098090171814, + "kl": 0.650390625, + "learning_rate": 1.4697980722778976e-07, + "loss": 0.0017, + "reward": 1.0892857909202576, + "reward_std": 0.1979723647236824, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.9687500298023224, + "step": 2905 + }, + { + "completion_length": 851.7678985595703, + "epoch": 0.8680457023373908, + "grad_norm": 1.0263656377792358, + "kl": 0.55615234375, + "learning_rate": 1.4677122664176572e-07, + "loss": 0.019, + "reward": 1.1808036267757416, + "reward_std": 0.22284404560923576, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9687500596046448, + "step": 2906 + }, + { + "completion_length": 939.8839569091797, + "epoch": 0.8683444104249123, + "grad_norm": 1.0978707075119019, + "kl": 0.671875, + "learning_rate": 1.465630847291264e-07, + "loss": 0.008, + "reward": 1.0959821939468384, + "reward_std": 0.22808216139674187, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.957589328289032, + "step": 2907 + }, + { + "completion_length": 978.3728179931641, + "epoch": 0.8686431185124337, + "grad_norm": 0.8929941654205322, + "kl": 0.708984375, + "learning_rate": 1.4635538171630992e-07, + "loss": 0.0239, + "reward": 1.0669643580913544, + "reward_std": 0.3117176443338394, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.95089291036129, + "step": 2908 + }, + { + "completion_length": 953.9397735595703, + "epoch": 0.8689418265999552, + "grad_norm": 1.1981335878372192, + "kl": 0.66357421875, + "learning_rate": 1.4614811782927667e-07, + "loss": 0.0208, + "reward": 1.1026786267757416, + "reward_std": 0.2200748473405838, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.9709821939468384, + "step": 2909 + }, + { + "completion_length": 894.8594207763672, + "epoch": 0.8692405346874766, + "grad_norm": 1.3186969757080078, + "kl": 0.60107421875, + "learning_rate": 1.4594129329350944e-07, + "loss": -0.0281, + "reward": 1.1071429252624512, + "reward_std": 0.2611399404704571, + "rewards/accuracy_reward": 0.14285714738070965, + "rewards/format_reward": 0.964285746216774, + "step": 2910 + }, + { + "completion_length": 880.4308471679688, + "epoch": 0.8695392427749982, + "grad_norm": 2.039381980895996, + "kl": 0.69384765625, + "learning_rate": 1.4573490833401316e-07, + "loss": 0.0136, + "reward": 1.0334821790456772, + "reward_std": 0.2479725293815136, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.9464286118745804, + "step": 2911 + }, + { + "completion_length": 879.6830749511719, + "epoch": 0.8698379508625196, + "grad_norm": 1.5237306356430054, + "kl": 0.57080078125, + "learning_rate": 1.4552896317531436e-07, + "loss": 0.0056, + "reward": 1.1183036267757416, + "reward_std": 0.197877099737525, + "rewards/accuracy_reward": 0.14955357694998384, + "rewards/format_reward": 0.9687500447034836, + "step": 2912 + }, + { + "completion_length": 935.5513916015625, + "epoch": 0.8701366589500411, + "grad_norm": 1.3466796875, + "kl": 0.568359375, + "learning_rate": 1.4532345804146113e-07, + "loss": 0.0186, + "reward": 1.1718750298023224, + "reward_std": 0.2636246085166931, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9709821790456772, + "step": 2913 + }, + { + "completion_length": 904.8058319091797, + "epoch": 0.8704353670375625, + "grad_norm": 1.5193617343902588, + "kl": 0.5947265625, + "learning_rate": 1.4511839315602308e-07, + "loss": -0.0, + "reward": 1.1116071939468384, + "reward_std": 0.27564210444688797, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.9553571939468384, + "step": 2914 + }, + { + "completion_length": 854.3527069091797, + "epoch": 0.870734075125084, + "grad_norm": 1.6732410192489624, + "kl": 0.7314453125, + "learning_rate": 1.449137687420906e-07, + "loss": 0.028, + "reward": 1.194196492433548, + "reward_std": 0.2503204308450222, + "rewards/accuracy_reward": 0.24330357927829027, + "rewards/format_reward": 0.9508928954601288, + "step": 2915 + }, + { + "completion_length": 843.9442291259766, + "epoch": 0.8710327832126055, + "grad_norm": 1.6009917259216309, + "kl": 0.6142578125, + "learning_rate": 1.4470958502227496e-07, + "loss": 0.0071, + "reward": 1.1004464626312256, + "reward_std": 0.24668440595269203, + "rewards/accuracy_reward": 0.133928582072258, + "rewards/format_reward": 0.9665178954601288, + "step": 2916 + }, + { + "completion_length": 905.0893249511719, + "epoch": 0.871331491300127, + "grad_norm": 1.2698081731796265, + "kl": 0.6455078125, + "learning_rate": 1.445058422187082e-07, + "loss": 0.0164, + "reward": 1.1875000596046448, + "reward_std": 0.26137224957346916, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.957589328289032, + "step": 2917 + }, + { + "completion_length": 973.4241485595703, + "epoch": 0.8716301993876484, + "grad_norm": 1.6537861824035645, + "kl": 0.8037109375, + "learning_rate": 1.4430254055304225e-07, + "loss": -0.004, + "reward": 1.0334821939468384, + "reward_std": 0.21529488265514374, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.9508928954601288, + "step": 2918 + }, + { + "completion_length": 898.7187957763672, + "epoch": 0.8719289074751699, + "grad_norm": 0.9614280462265015, + "kl": 0.7734375, + "learning_rate": 1.440996802464497e-07, + "loss": 0.0429, + "reward": 1.1339286267757416, + "reward_std": 0.22765596956014633, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.9687500447034836, + "step": 2919 + }, + { + "completion_length": 1144.7411193847656, + "epoch": 0.8722276155626913, + "grad_norm": 1.1771153211593628, + "kl": 0.80224609375, + "learning_rate": 1.4389726151962242e-07, + "loss": 0.0188, + "reward": 1.071428656578064, + "reward_std": 0.2630849853157997, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9531250298023224, + "step": 2920 + }, + { + "completion_length": 855.5379791259766, + "epoch": 0.8725263236502129, + "grad_norm": 1.5047301054000854, + "kl": 0.63134765625, + "learning_rate": 1.4369528459277228e-07, + "loss": -0.0078, + "reward": 1.1808036267757416, + "reward_std": 0.2929191067814827, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.9553571790456772, + "step": 2921 + }, + { + "completion_length": 933.6027069091797, + "epoch": 0.8728250317377343, + "grad_norm": 1.304068922996521, + "kl": 0.6474609375, + "learning_rate": 1.4349374968563044e-07, + "loss": 0.0041, + "reward": 1.1383929252624512, + "reward_std": 0.2449670173227787, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.96651791036129, + "step": 2922 + }, + { + "completion_length": 782.4062805175781, + "epoch": 0.8731237398252558, + "grad_norm": 0.9563938975334167, + "kl": 0.63232421875, + "learning_rate": 1.43292657017447e-07, + "loss": 0.0455, + "reward": 1.1875000596046448, + "reward_std": 0.2542719766497612, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9665178954601288, + "step": 2923 + }, + { + "completion_length": 904.6205749511719, + "epoch": 0.8734224479127772, + "grad_norm": 1.152268886566162, + "kl": 0.62890625, + "learning_rate": 1.4309200680699104e-07, + "loss": -0.0124, + "reward": 1.1227679252624512, + "reward_std": 0.25306694209575653, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9642857611179352, + "step": 2924 + }, + { + "completion_length": 910.6049499511719, + "epoch": 0.8737211560002988, + "grad_norm": 1.8983980417251587, + "kl": 0.63525390625, + "learning_rate": 1.4289179927255058e-07, + "loss": 0.0191, + "reward": 1.0736607611179352, + "reward_std": 0.26252957433462143, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.9508928954601288, + "step": 2925 + }, + { + "completion_length": 862.4732513427734, + "epoch": 0.8740198640878202, + "grad_norm": 0.8896445631980896, + "kl": 0.58984375, + "learning_rate": 1.4269203463193148e-07, + "loss": 0.0043, + "reward": 1.1227679252624512, + "reward_std": 0.18491305783391, + "rewards/accuracy_reward": 0.15625000465661287, + "rewards/format_reward": 0.9665178954601288, + "step": 2926 + }, + { + "completion_length": 963.1987152099609, + "epoch": 0.8743185721753417, + "grad_norm": 1.0832314491271973, + "kl": 0.48974609375, + "learning_rate": 1.424927131024582e-07, + "loss": 0.0259, + "reward": 1.1540178954601288, + "reward_std": 0.18839738331735134, + "rewards/accuracy_reward": 0.17633929569274187, + "rewards/format_reward": 0.9776786118745804, + "step": 2927 + }, + { + "completion_length": 874.9442443847656, + "epoch": 0.8746172802628631, + "grad_norm": 0.862144947052002, + "kl": 0.7197265625, + "learning_rate": 1.4229383490097325e-07, + "loss": 0.0434, + "reward": 1.162946492433548, + "reward_std": 0.24736900255084038, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9776786118745804, + "step": 2928 + }, + { + "completion_length": 844.7210235595703, + "epoch": 0.8749159883503845, + "grad_norm": 1.8000065088272095, + "kl": 0.681640625, + "learning_rate": 1.4209540024383627e-07, + "loss": 0.0578, + "reward": 1.1919643580913544, + "reward_std": 0.2693942226469517, + "rewards/accuracy_reward": 0.23214286658912897, + "rewards/format_reward": 0.9598214626312256, + "step": 2929 + }, + { + "completion_length": 960.6540679931641, + "epoch": 0.8752146964379061, + "grad_norm": 1.5294227600097656, + "kl": 0.4619140625, + "learning_rate": 1.4189740934692497e-07, + "loss": 0.0675, + "reward": 1.1674107611179352, + "reward_std": 0.24162236228585243, + "rewards/accuracy_reward": 0.2053571566939354, + "rewards/format_reward": 0.9620536118745804, + "step": 2930 + }, + { + "completion_length": 947.3571624755859, + "epoch": 0.8755134045254275, + "grad_norm": 1.1052418947219849, + "kl": 0.5703125, + "learning_rate": 1.4169986242563388e-07, + "loss": 0.0464, + "reward": 1.1361607611179352, + "reward_std": 0.2976548671722412, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9598214626312256, + "step": 2931 + }, + { + "completion_length": 818.2120971679688, + "epoch": 0.875812112612949, + "grad_norm": 1.0765101909637451, + "kl": 0.70263671875, + "learning_rate": 1.4150275969487472e-07, + "loss": 0.0245, + "reward": 1.2678571939468384, + "reward_std": 0.30241966620087624, + "rewards/accuracy_reward": 0.2968750111758709, + "rewards/format_reward": 0.9709821790456772, + "step": 2932 + }, + { + "completion_length": 902.9955902099609, + "epoch": 0.8761108207004704, + "grad_norm": 0.829276978969574, + "kl": 0.50146484375, + "learning_rate": 1.4130610136907606e-07, + "loss": -0.0229, + "reward": 1.1138393431901932, + "reward_std": 0.2734360918402672, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.9553571939468384, + "step": 2933 + }, + { + "completion_length": 968.107177734375, + "epoch": 0.8764095287879919, + "grad_norm": 0.892780065536499, + "kl": 0.62646484375, + "learning_rate": 1.4110988766218273e-07, + "loss": 0.0374, + "reward": 1.109375074505806, + "reward_std": 0.20040713623166084, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.9665178954601288, + "step": 2934 + }, + { + "completion_length": 914.4196929931641, + "epoch": 0.8767082368755134, + "grad_norm": 1.9780856370925903, + "kl": 0.724609375, + "learning_rate": 1.4091411878765611e-07, + "loss": 0.0414, + "reward": 1.0758928954601288, + "reward_std": 0.2467649132013321, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.9620536118745804, + "step": 2935 + }, + { + "completion_length": 897.5759429931641, + "epoch": 0.8770069449630349, + "grad_norm": 1.6544933319091797, + "kl": 0.49755859375, + "learning_rate": 1.407187949584736e-07, + "loss": 0.0547, + "reward": 1.2031250298023224, + "reward_std": 0.23986584320664406, + "rewards/accuracy_reward": 0.2455357238650322, + "rewards/format_reward": 0.957589328289032, + "step": 2936 + }, + { + "completion_length": 947.4085235595703, + "epoch": 0.8773056530505563, + "grad_norm": 0.9471513032913208, + "kl": 0.5703125, + "learning_rate": 1.405239163871282e-07, + "loss": -0.0066, + "reward": 1.09151791036129, + "reward_std": 0.2693426124751568, + "rewards/accuracy_reward": 0.13839286426082253, + "rewards/format_reward": 0.9531250447034836, + "step": 2937 + }, + { + "completion_length": 935.6897888183594, + "epoch": 0.8776043611380778, + "grad_norm": 0.8353162407875061, + "kl": 0.455078125, + "learning_rate": 1.4032948328562882e-07, + "loss": -0.0102, + "reward": 1.0625000298023224, + "reward_std": 0.1628045942634344, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.9687500298023224, + "step": 2938 + }, + { + "completion_length": 954.5781707763672, + "epoch": 0.8779030692255992, + "grad_norm": 1.691592812538147, + "kl": 0.75, + "learning_rate": 1.4013549586549972e-07, + "loss": 0.0073, + "reward": 1.0312500298023224, + "reward_std": 0.26134589686989784, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.9352678954601288, + "step": 2939 + }, + { + "completion_length": 922.9286041259766, + "epoch": 0.8782017773131208, + "grad_norm": 1.1243751049041748, + "kl": 0.7138671875, + "learning_rate": 1.3994195433777992e-07, + "loss": 0.0232, + "reward": 1.0558036267757416, + "reward_std": 0.18240543454885483, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.9642857611179352, + "step": 2940 + }, + { + "completion_length": 880.8303985595703, + "epoch": 0.8785004854006422, + "grad_norm": 1.1035772562026978, + "kl": 0.6640625, + "learning_rate": 1.3974885891302386e-07, + "loss": 0.0211, + "reward": 1.2566965222358704, + "reward_std": 0.24497977271676064, + "rewards/accuracy_reward": 0.3102678768336773, + "rewards/format_reward": 0.9464286118745804, + "step": 2941 + }, + { + "completion_length": 937.825927734375, + "epoch": 0.8787991934881637, + "grad_norm": 1.3995137214660645, + "kl": 0.79736328125, + "learning_rate": 1.3955620980130042e-07, + "loss": 0.0751, + "reward": 1.0803572237491608, + "reward_std": 0.2325350157916546, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.957589328289032, + "step": 2942 + }, + { + "completion_length": 940.7522888183594, + "epoch": 0.8790979015756851, + "grad_norm": 1.0003609657287598, + "kl": 0.6484375, + "learning_rate": 1.3936400721219282e-07, + "loss": 0.0062, + "reward": 1.1361607611179352, + "reward_std": 0.2234317511320114, + "rewards/accuracy_reward": 0.17410715483129025, + "rewards/format_reward": 0.9620536267757416, + "step": 2943 + }, + { + "completion_length": 881.2031555175781, + "epoch": 0.8793966096632067, + "grad_norm": 0.7578193545341492, + "kl": 0.60107421875, + "learning_rate": 1.3917225135479882e-07, + "loss": 0.0169, + "reward": 1.1808036267757416, + "reward_std": 0.24553866684436798, + "rewards/accuracy_reward": 0.2276785783469677, + "rewards/format_reward": 0.9531250447034836, + "step": 2944 + }, + { + "completion_length": 882.2589569091797, + "epoch": 0.8796953177507281, + "grad_norm": 0.8559393286705017, + "kl": 0.7666015625, + "learning_rate": 1.3898094243772979e-07, + "loss": 0.0245, + "reward": 1.0892857611179352, + "reward_std": 0.25943197682499886, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.9598214626312256, + "step": 2945 + }, + { + "completion_length": 942.3303985595703, + "epoch": 0.8799940258382496, + "grad_norm": 1.4542768001556396, + "kl": 0.6376953125, + "learning_rate": 1.3879008066911115e-07, + "loss": -0.0171, + "reward": 1.1093750298023224, + "reward_std": 0.23482709378004074, + "rewards/accuracy_reward": 0.16517858020961285, + "rewards/format_reward": 0.9441964477300644, + "step": 2946 + }, + { + "completion_length": 925.0982360839844, + "epoch": 0.880292733925771, + "grad_norm": 0.8617219924926758, + "kl": 0.68701171875, + "learning_rate": 1.3859966625658205e-07, + "loss": 0.023, + "reward": 1.0714286118745804, + "reward_std": 0.19521669670939445, + "rewards/accuracy_reward": 0.09821429336443543, + "rewards/format_reward": 0.973214328289032, + "step": 2947 + }, + { + "completion_length": 910.4174499511719, + "epoch": 0.8805914420132925, + "grad_norm": 1.5538415908813477, + "kl": 0.59326171875, + "learning_rate": 1.384096994072943e-07, + "loss": 0.0168, + "reward": 1.1406250298023224, + "reward_std": 0.2636190913617611, + "rewards/accuracy_reward": 0.17410715483129025, + "rewards/format_reward": 0.9665178954601288, + "step": 2948 + }, + { + "completion_length": 812.1607666015625, + "epoch": 0.880890150100814, + "grad_norm": 1.1914124488830566, + "kl": 0.54736328125, + "learning_rate": 1.3822018032791345e-07, + "loss": -0.0069, + "reward": 1.1383929252624512, + "reward_std": 0.20985575765371323, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9687500298023224, + "step": 2949 + }, + { + "completion_length": 888.2879791259766, + "epoch": 0.8811888581883355, + "grad_norm": 1.1974616050720215, + "kl": 0.56591796875, + "learning_rate": 1.380311092246177e-07, + "loss": -0.002, + "reward": 1.1741071939468384, + "reward_std": 0.18682662770152092, + "rewards/accuracy_reward": 0.2008928693830967, + "rewards/format_reward": 0.9732143133878708, + "step": 2950 + }, + { + "completion_length": 933.5871124267578, + "epoch": 0.8814875662758569, + "grad_norm": 1.297365665435791, + "kl": 0.62744140625, + "learning_rate": 1.378424863030978e-07, + "loss": 0.0597, + "reward": 1.0781250298023224, + "reward_std": 0.22365178540349007, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9553571790456772, + "step": 2951 + }, + { + "completion_length": 870.3928985595703, + "epoch": 0.8817862743633784, + "grad_norm": 1.8843187093734741, + "kl": 0.7626953125, + "learning_rate": 1.3765431176855697e-07, + "loss": 0.009, + "reward": 1.0714285969734192, + "reward_std": 0.2970850095152855, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9464286118745804, + "step": 2952 + }, + { + "completion_length": 918.3884429931641, + "epoch": 0.8820849824508998, + "grad_norm": 0.7120177149772644, + "kl": 0.72998046875, + "learning_rate": 1.374665858257108e-07, + "loss": 0.0094, + "reward": 1.2522321939468384, + "reward_std": 0.260669831186533, + "rewards/accuracy_reward": 0.28348216228187084, + "rewards/format_reward": 0.9687500298023224, + "step": 2953 + }, + { + "completion_length": 939.6562805175781, + "epoch": 0.8823836905384214, + "grad_norm": 1.2105154991149902, + "kl": 0.57373046875, + "learning_rate": 1.3727930867878655e-07, + "loss": 0.0551, + "reward": 1.0937500298023224, + "reward_std": 0.23613003827631474, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.973214328289032, + "step": 2954 + }, + { + "completion_length": 862.5670013427734, + "epoch": 0.8826823986259428, + "grad_norm": 1.177113652229309, + "kl": 0.771484375, + "learning_rate": 1.370924805315235e-07, + "loss": 0.0363, + "reward": 1.191964328289032, + "reward_std": 0.29151613265275955, + "rewards/accuracy_reward": 0.2299107313156128, + "rewards/format_reward": 0.9620536267757416, + "step": 2955 + }, + { + "completion_length": 826.2678985595703, + "epoch": 0.8829811067134643, + "grad_norm": 2.3444130420684814, + "kl": 0.716796875, + "learning_rate": 1.3690610158717244e-07, + "loss": 0.037, + "reward": 1.0491072088479996, + "reward_std": 0.2243981808423996, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.9464285969734192, + "step": 2956 + }, + { + "completion_length": 847.4643249511719, + "epoch": 0.8832798148009857, + "grad_norm": 0.8606436252593994, + "kl": 0.7275390625, + "learning_rate": 1.3672017204849521e-07, + "loss": -0.0038, + "reward": 1.1629464626312256, + "reward_std": 0.2150810770690441, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9598214775323868, + "step": 2957 + }, + { + "completion_length": 956.3661193847656, + "epoch": 0.8835785228885072, + "grad_norm": 0.5540036559104919, + "kl": 0.50439453125, + "learning_rate": 1.3653469211776507e-07, + "loss": 0.0271, + "reward": 1.0625000298023224, + "reward_std": 0.21325436979532242, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.9620536118745804, + "step": 2958 + }, + { + "completion_length": 961.1741638183594, + "epoch": 0.8838772309760287, + "grad_norm": 0.952890932559967, + "kl": 0.7470703125, + "learning_rate": 1.3634966199676586e-07, + "loss": 0.0386, + "reward": 1.131696492433548, + "reward_std": 0.3092179223895073, + "rewards/accuracy_reward": 0.19196429336443543, + "rewards/format_reward": 0.9397321790456772, + "step": 2959 + }, + { + "completion_length": 903.9576416015625, + "epoch": 0.8841759390635502, + "grad_norm": 1.5273449420928955, + "kl": 0.734375, + "learning_rate": 1.361650818867924e-07, + "loss": 0.0189, + "reward": 1.0781250447034836, + "reward_std": 0.2589598074555397, + "rewards/accuracy_reward": 0.1272321529686451, + "rewards/format_reward": 0.9508928954601288, + "step": 2960 + }, + { + "completion_length": 960.2344055175781, + "epoch": 0.8844746471510716, + "grad_norm": 1.0080692768096924, + "kl": 0.61669921875, + "learning_rate": 1.3598095198864967e-07, + "loss": 0.0387, + "reward": 1.1495535969734192, + "reward_std": 0.2624334655702114, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.9687500596046448, + "step": 2961 + }, + { + "completion_length": 824.513427734375, + "epoch": 0.8847733552385931, + "grad_norm": 1.1108458042144775, + "kl": 0.5810546875, + "learning_rate": 1.3579727250265285e-07, + "loss": 0.042, + "reward": 1.191964328289032, + "reward_std": 0.2606463022530079, + "rewards/accuracy_reward": 0.2299107313156128, + "rewards/format_reward": 0.9620536267757416, + "step": 2962 + }, + { + "completion_length": 895.9710235595703, + "epoch": 0.8850720633261145, + "grad_norm": 1.7129658460617065, + "kl": 0.65283203125, + "learning_rate": 1.3561404362862736e-07, + "loss": 0.0318, + "reward": 1.02901791036129, + "reward_std": 0.21206963807344437, + "rewards/accuracy_reward": 0.06696429010480642, + "rewards/format_reward": 0.9620535969734192, + "step": 2963 + }, + { + "completion_length": 835.1719055175781, + "epoch": 0.8853707714136361, + "grad_norm": 1.927554726600647, + "kl": 0.8671875, + "learning_rate": 1.3543126556590827e-07, + "loss": 0.0253, + "reward": 1.2209821939468384, + "reward_std": 0.278893293812871, + "rewards/accuracy_reward": 0.2656250111758709, + "rewards/format_reward": 0.9553571790456772, + "step": 2964 + }, + { + "completion_length": 960.450927734375, + "epoch": 0.8856694795011575, + "grad_norm": 1.1426066160202026, + "kl": 0.7998046875, + "learning_rate": 1.352489385133401e-07, + "loss": 0.0382, + "reward": 1.066964328289032, + "reward_std": 0.2779373973608017, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.9441964626312256, + "step": 2965 + }, + { + "completion_length": 898.8951416015625, + "epoch": 0.885968187588679, + "grad_norm": 1.261110544204712, + "kl": 0.775390625, + "learning_rate": 1.3506706266927677e-07, + "loss": 0.0321, + "reward": 1.073660746216774, + "reward_std": 0.2283652238547802, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9598214626312256, + "step": 2966 + }, + { + "completion_length": 934.71435546875, + "epoch": 0.8862668956762004, + "grad_norm": 1.028701901435852, + "kl": 0.6513671875, + "learning_rate": 1.348856382315816e-07, + "loss": 0.0137, + "reward": 1.102678656578064, + "reward_std": 0.23595517501235008, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9553571939468384, + "step": 2967 + }, + { + "completion_length": 884.4710235595703, + "epoch": 0.886565603763722, + "grad_norm": 1.2840790748596191, + "kl": 0.6416015625, + "learning_rate": 1.3470466539762637e-07, + "loss": 0.0203, + "reward": 1.0781250596046448, + "reward_std": 0.1863383986055851, + "rewards/accuracy_reward": 0.10044643143191934, + "rewards/format_reward": 0.9776785969734192, + "step": 2968 + }, + { + "completion_length": 892.0714721679688, + "epoch": 0.8868643118512434, + "grad_norm": 1.1512657403945923, + "kl": 0.7265625, + "learning_rate": 1.345241443642919e-07, + "loss": -0.0109, + "reward": 1.0647321790456772, + "reward_std": 0.20323598384857178, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.9486607611179352, + "step": 2969 + }, + { + "completion_length": 873.5402221679688, + "epoch": 0.8871630199387649, + "grad_norm": 0.8662549257278442, + "kl": 0.59716796875, + "learning_rate": 1.3434407532796738e-07, + "loss": 0.0288, + "reward": 1.1517857611179352, + "reward_std": 0.21815725415945053, + "rewards/accuracy_reward": 0.1808035746216774, + "rewards/format_reward": 0.9709821790456772, + "step": 2970 + }, + { + "completion_length": 921.5670013427734, + "epoch": 0.8874617280262863, + "grad_norm": 1.2752851247787476, + "kl": 0.61279296875, + "learning_rate": 1.3416445848455015e-07, + "loss": 0.042, + "reward": 1.0736607611179352, + "reward_std": 0.2085910066962242, + "rewards/accuracy_reward": 0.11830358067527413, + "rewards/format_reward": 0.9553571790456772, + "step": 2971 + }, + { + "completion_length": 991.4375457763672, + "epoch": 0.8877604361138077, + "grad_norm": 0.8684391975402832, + "kl": 0.60986328125, + "learning_rate": 1.3398529402944596e-07, + "loss": 0.0286, + "reward": 1.02901791036129, + "reward_std": 0.18485337495803833, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.9598214626312256, + "step": 2972 + }, + { + "completion_length": 951.6942443847656, + "epoch": 0.8880591442013293, + "grad_norm": 1.511452078819275, + "kl": 0.44091796875, + "learning_rate": 1.3380658215756795e-07, + "loss": 0.0336, + "reward": 1.2276785969734192, + "reward_std": 0.22697585448622704, + "rewards/accuracy_reward": 0.2544642984867096, + "rewards/format_reward": 0.973214328289032, + "step": 2973 + }, + { + "completion_length": 846.7299499511719, + "epoch": 0.8883578522888507, + "grad_norm": 1.0080264806747437, + "kl": 0.5771484375, + "learning_rate": 1.3362832306333722e-07, + "loss": 0.0337, + "reward": 1.3281250596046448, + "reward_std": 0.25927015393972397, + "rewards/accuracy_reward": 0.3437500149011612, + "rewards/format_reward": 0.9843750298023224, + "step": 2974 + }, + { + "completion_length": 820.1763763427734, + "epoch": 0.8886565603763722, + "grad_norm": 1.4763351678848267, + "kl": 0.4833984375, + "learning_rate": 1.3345051694068222e-07, + "loss": 0.0128, + "reward": 1.066964328289032, + "reward_std": 0.22994378581643105, + "rewards/accuracy_reward": 0.09598214831203222, + "rewards/format_reward": 0.970982164144516, + "step": 2975 + }, + { + "completion_length": 851.1071929931641, + "epoch": 0.8889552684638936, + "grad_norm": 0.8486505746841431, + "kl": 0.59521484375, + "learning_rate": 1.332731639830385e-07, + "loss": 0.0007, + "reward": 1.2276786267757416, + "reward_std": 0.2824692353606224, + "rewards/accuracy_reward": 0.2633928656578064, + "rewards/format_reward": 0.964285746216774, + "step": 2976 + }, + { + "completion_length": 814.6451263427734, + "epoch": 0.8892539765514151, + "grad_norm": 0.6434459686279297, + "kl": 0.48876953125, + "learning_rate": 1.3309626438334876e-07, + "loss": -0.0006, + "reward": 1.2209821939468384, + "reward_std": 0.21313732862472534, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9821428954601288, + "step": 2977 + }, + { + "completion_length": 819.3460083007812, + "epoch": 0.8895526846389366, + "grad_norm": 1.7574031352996826, + "kl": 0.5986328125, + "learning_rate": 1.329198183340625e-07, + "loss": 0.0437, + "reward": 1.3102679252624512, + "reward_std": 0.30312730371952057, + "rewards/accuracy_reward": 0.3415178768336773, + "rewards/format_reward": 0.9687500298023224, + "step": 2978 + }, + { + "completion_length": 910.4553985595703, + "epoch": 0.8898513927264581, + "grad_norm": 1.0224663019180298, + "kl": 0.66455078125, + "learning_rate": 1.327438260271355e-07, + "loss": 0.0452, + "reward": 1.147321492433548, + "reward_std": 0.300796065479517, + "rewards/accuracy_reward": 0.20758929569274187, + "rewards/format_reward": 0.9397321790456772, + "step": 2979 + }, + { + "completion_length": 996.0111846923828, + "epoch": 0.8901501008139795, + "grad_norm": 1.6596397161483765, + "kl": 0.6328125, + "learning_rate": 1.3256828765403038e-07, + "loss": 0.0163, + "reward": 1.0401786267757416, + "reward_std": 0.22571245208382607, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.9553571790456772, + "step": 2980 + }, + { + "completion_length": 933.5647735595703, + "epoch": 0.890448808901501, + "grad_norm": 1.0400243997573853, + "kl": 0.607421875, + "learning_rate": 1.323932034057156e-07, + "loss": 0.0142, + "reward": 1.0200893431901932, + "reward_std": 0.1847705990076065, + "rewards/accuracy_reward": 0.06473214644938707, + "rewards/format_reward": 0.9553571790456772, + "step": 2981 + }, + { + "completion_length": 869.5111999511719, + "epoch": 0.8907475169890224, + "grad_norm": 1.630876898765564, + "kl": 0.6962890625, + "learning_rate": 1.322185734726656e-07, + "loss": 0.0052, + "reward": 1.227678656578064, + "reward_std": 0.35921983048319817, + "rewards/accuracy_reward": 0.2924107164144516, + "rewards/format_reward": 0.9352678954601288, + "step": 2982 + }, + { + "completion_length": 1003.5000457763672, + "epoch": 0.891046225076544, + "grad_norm": 1.4276829957962036, + "kl": 0.81591796875, + "learning_rate": 1.3204439804486061e-07, + "loss": 0.0225, + "reward": 1.0602679252624512, + "reward_std": 0.24201371893286705, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.957589328289032, + "step": 2983 + }, + { + "completion_length": 956.7031555175781, + "epoch": 0.8913449331640654, + "grad_norm": 1.003865361213684, + "kl": 0.5576171875, + "learning_rate": 1.3187067731178666e-07, + "loss": -0.0038, + "reward": 1.0825893431901932, + "reward_std": 0.24623103439807892, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9642857611179352, + "step": 2984 + }, + { + "completion_length": 894.7076263427734, + "epoch": 0.8916436412515869, + "grad_norm": 0.8347485661506653, + "kl": 0.74365234375, + "learning_rate": 1.316974114624346e-07, + "loss": 0.0145, + "reward": 1.0982143580913544, + "reward_std": 0.2607959620654583, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.9508928954601288, + "step": 2985 + }, + { + "completion_length": 928.1986999511719, + "epoch": 0.8919423493391083, + "grad_norm": 1.3830492496490479, + "kl": 0.68505859375, + "learning_rate": 1.3152460068530089e-07, + "loss": 0.0339, + "reward": 1.0825893431901932, + "reward_std": 0.24173326045274734, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/format_reward": 0.9575893431901932, + "step": 2986 + }, + { + "completion_length": 868.5513763427734, + "epoch": 0.8922410574266298, + "grad_norm": 1.0073171854019165, + "kl": 0.8310546875, + "learning_rate": 1.3135224516838656e-07, + "loss": 0.0355, + "reward": 1.1406250298023224, + "reward_std": 0.24068864807486534, + "rewards/accuracy_reward": 0.16964286379516125, + "rewards/format_reward": 0.970982164144516, + "step": 2987 + }, + { + "completion_length": 898.4219207763672, + "epoch": 0.8925397655141513, + "grad_norm": 0.9124859571456909, + "kl": 0.64208984375, + "learning_rate": 1.3118034509919759e-07, + "loss": 0.0307, + "reward": 1.1919643580913544, + "reward_std": 0.2433728538453579, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9620536118745804, + "step": 2988 + }, + { + "completion_length": 978.8527069091797, + "epoch": 0.8928384736016728, + "grad_norm": 0.7599155902862549, + "kl": 0.59521484375, + "learning_rate": 1.3100890066474454e-07, + "loss": 0.0163, + "reward": 1.129464328289032, + "reward_std": 0.20617384277284145, + "rewards/accuracy_reward": 0.16294643771834671, + "rewards/format_reward": 0.9665178954601288, + "step": 2989 + }, + { + "completion_length": 949.9330902099609, + "epoch": 0.8931371816891942, + "grad_norm": 0.8541893362998962, + "kl": 0.76953125, + "learning_rate": 1.3083791205154187e-07, + "loss": 0.0161, + "reward": 1.1316964626312256, + "reward_std": 0.25112859159708023, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9575893133878708, + "step": 2990 + }, + { + "completion_length": 892.2835388183594, + "epoch": 0.8934358897767157, + "grad_norm": 1.4231452941894531, + "kl": 0.53515625, + "learning_rate": 1.3066737944560867e-07, + "loss": 0.0377, + "reward": 1.1071428954601288, + "reward_std": 0.2101772278547287, + "rewards/accuracy_reward": 0.14285715017467737, + "rewards/format_reward": 0.964285746216774, + "step": 2991 + }, + { + "completion_length": 891.4353179931641, + "epoch": 0.8937345978642371, + "grad_norm": 1.3617289066314697, + "kl": 0.7373046875, + "learning_rate": 1.3049730303246761e-07, + "loss": -0.0016, + "reward": 1.1227678954601288, + "reward_std": 0.20888399332761765, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9665178954601288, + "step": 2992 + }, + { + "completion_length": 874.1049652099609, + "epoch": 0.8940333059517587, + "grad_norm": 1.8761346340179443, + "kl": 0.73681640625, + "learning_rate": 1.3032768299714517e-07, + "loss": 0.0668, + "reward": 1.0602678954601288, + "reward_std": 0.2269027754664421, + "rewards/accuracy_reward": 0.09821429033763707, + "rewards/format_reward": 0.9620536118745804, + "step": 2993 + }, + { + "completion_length": 1007.6763763427734, + "epoch": 0.8943320140392801, + "grad_norm": 1.5744541883468628, + "kl": 0.73095703125, + "learning_rate": 1.3015851952417125e-07, + "loss": 0.0249, + "reward": 1.1629465073347092, + "reward_std": 0.2307380847632885, + "rewards/accuracy_reward": 0.20312500232830644, + "rewards/format_reward": 0.9598214775323868, + "step": 2994 + }, + { + "completion_length": 876.1094207763672, + "epoch": 0.8946307221268016, + "grad_norm": 1.3249696493148804, + "kl": 0.81201171875, + "learning_rate": 1.2998981279757932e-07, + "loss": -0.0122, + "reward": 1.0580357611179352, + "reward_std": 0.304029144346714, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.9464285969734192, + "step": 2995 + }, + { + "completion_length": 908.6920013427734, + "epoch": 0.894929430214323, + "grad_norm": 1.3726760149002075, + "kl": 0.66015625, + "learning_rate": 1.2982156300090557e-07, + "loss": 0.0431, + "reward": 1.1607143580913544, + "reward_std": 0.23408927395939827, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.970982164144516, + "step": 2996 + }, + { + "completion_length": 924.3750457763672, + "epoch": 0.8952281383018446, + "grad_norm": 1.1877254247665405, + "kl": 0.86328125, + "learning_rate": 1.2965377031718934e-07, + "loss": 0.0553, + "reward": 1.0647321939468384, + "reward_std": 0.22345047071576118, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.957589328289032, + "step": 2997 + }, + { + "completion_length": 1040.5469055175781, + "epoch": 0.895526846389366, + "grad_norm": 1.1433027982711792, + "kl": 0.83056640625, + "learning_rate": 1.2948643492897276e-07, + "loss": 0.0547, + "reward": 1.0558036267757416, + "reward_std": 0.20574037078768015, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.9598214775323868, + "step": 2998 + }, + { + "completion_length": 839.8594055175781, + "epoch": 0.8958255544768875, + "grad_norm": 1.1698261499404907, + "kl": 0.7255859375, + "learning_rate": 1.293195570183001e-07, + "loss": 0.0504, + "reward": 1.0602679252624512, + "reward_std": 0.2440231293439865, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.9620536118745804, + "step": 2999 + }, + { + "completion_length": 811.6942138671875, + "epoch": 0.8961242625644089, + "grad_norm": 1.080064058303833, + "kl": 0.615234375, + "learning_rate": 1.2915313676671838e-07, + "loss": 0.0152, + "reward": 1.2142857611179352, + "reward_std": 0.20487337186932564, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.973214328289032, + "step": 3000 + }, + { + "completion_length": 938.9219055175781, + "epoch": 0.8964229706519304, + "grad_norm": 0.7319203019142151, + "kl": 0.595703125, + "learning_rate": 1.2898717435527636e-07, + "loss": 0.024, + "reward": 1.2031250298023224, + "reward_std": 0.24842631816864014, + "rewards/accuracy_reward": 0.2366071492433548, + "rewards/format_reward": 0.9665178954601288, + "step": 3001 + }, + { + "completion_length": 898.6942291259766, + "epoch": 0.8967216787394519, + "grad_norm": 1.035291314125061, + "kl": 0.6650390625, + "learning_rate": 1.2882166996452497e-07, + "loss": 0.0191, + "reward": 1.1361607611179352, + "reward_std": 0.2926664911210537, + "rewards/accuracy_reward": 0.17633929662406445, + "rewards/format_reward": 0.9598214775323868, + "step": 3002 + }, + { + "completion_length": 907.5826416015625, + "epoch": 0.8970203868269734, + "grad_norm": 2.816518545150757, + "kl": 0.6845703125, + "learning_rate": 1.2865662377451678e-07, + "loss": 0.0292, + "reward": 1.1629464626312256, + "reward_std": 0.3038870319724083, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.9531250447034836, + "step": 3003 + }, + { + "completion_length": 879.6875457763672, + "epoch": 0.8973190949144948, + "grad_norm": 0.9477717280387878, + "kl": 0.7353515625, + "learning_rate": 1.284920359648058e-07, + "loss": -0.0198, + "reward": 1.0781250447034836, + "reward_std": 0.24199426546692848, + "rewards/accuracy_reward": 0.12053571990691125, + "rewards/format_reward": 0.957589328289032, + "step": 3004 + }, + { + "completion_length": 840.3772735595703, + "epoch": 0.8976178030020163, + "grad_norm": 1.4803879261016846, + "kl": 0.65625, + "learning_rate": 1.2832790671444745e-07, + "loss": 0.0193, + "reward": 1.1227679252624512, + "reward_std": 0.18170782178640366, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.9776786267757416, + "step": 3005 + }, + { + "completion_length": 901.8862152099609, + "epoch": 0.8979165110895377, + "grad_norm": 2.5313122272491455, + "kl": 0.677734375, + "learning_rate": 1.2816423620199837e-07, + "loss": 0.0204, + "reward": 1.2098215222358704, + "reward_std": 0.2733233645558357, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.9598214626312256, + "step": 3006 + }, + { + "completion_length": 927.7098541259766, + "epoch": 0.8982152191770593, + "grad_norm": 1.5293514728546143, + "kl": 0.60205078125, + "learning_rate": 1.2800102460551587e-07, + "loss": 0.0352, + "reward": 1.129464328289032, + "reward_std": 0.2078311126679182, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9754464626312256, + "step": 3007 + }, + { + "completion_length": 890.6049652099609, + "epoch": 0.8985139272645807, + "grad_norm": 1.6721173524856567, + "kl": 0.70361328125, + "learning_rate": 1.2783827210255826e-07, + "loss": 0.0537, + "reward": 1.1450893431901932, + "reward_std": 0.18744824826717377, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9620536267757416, + "step": 3008 + }, + { + "completion_length": 849.357177734375, + "epoch": 0.8988126353521022, + "grad_norm": 1.1006680727005005, + "kl": 0.75341796875, + "learning_rate": 1.2767597887018435e-07, + "loss": 0.0234, + "reward": 1.0870536416769028, + "reward_std": 0.22231462225317955, + "rewards/accuracy_reward": 0.13392857764847577, + "rewards/format_reward": 0.9531250596046448, + "step": 3009 + }, + { + "completion_length": 898.591552734375, + "epoch": 0.8991113434396236, + "grad_norm": 1.3769924640655518, + "kl": 0.744140625, + "learning_rate": 1.275141450849532e-07, + "loss": -0.0171, + "reward": 1.073660746216774, + "reward_std": 0.28149888664484024, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9486607611179352, + "step": 3010 + }, + { + "completion_length": 859.7277221679688, + "epoch": 0.8994100515271451, + "grad_norm": 1.6071274280548096, + "kl": 0.7158203125, + "learning_rate": 1.2735277092292406e-07, + "loss": 0.0035, + "reward": 1.0825893133878708, + "reward_std": 0.16844256222248077, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.957589328289032, + "step": 3011 + }, + { + "completion_length": 875.669677734375, + "epoch": 0.8997087596146666, + "grad_norm": 2.2187886238098145, + "kl": 0.7998046875, + "learning_rate": 1.2719185655965643e-07, + "loss": -0.0085, + "reward": 1.1428571939468384, + "reward_std": 0.19580146297812462, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.9620536118745804, + "step": 3012 + }, + { + "completion_length": 820.4955749511719, + "epoch": 0.9000074677021881, + "grad_norm": 2.742849588394165, + "kl": 0.77734375, + "learning_rate": 1.270314021702091e-07, + "loss": 0.0139, + "reward": 1.2165179550647736, + "reward_std": 0.2521437630057335, + "rewards/accuracy_reward": 0.2544642947614193, + "rewards/format_reward": 0.9620536118745804, + "step": 3013 + }, + { + "completion_length": 868.9754943847656, + "epoch": 0.9003061757897095, + "grad_norm": 1.362518548965454, + "kl": 0.5625, + "learning_rate": 1.2687140792914095e-07, + "loss": 0.0008, + "reward": 1.1049107909202576, + "reward_std": 0.2113107442855835, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.970982164144516, + "step": 3014 + }, + { + "completion_length": 949.8170013427734, + "epoch": 0.9006048838772309, + "grad_norm": 1.152360439300537, + "kl": 0.755859375, + "learning_rate": 1.267118740105098e-07, + "loss": 0.0193, + "reward": 1.082589328289032, + "reward_std": 0.21666078455746174, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.9642857611179352, + "step": 3015 + }, + { + "completion_length": 914.7835388183594, + "epoch": 0.9009035919647524, + "grad_norm": 1.7422845363616943, + "kl": 0.9599609375, + "learning_rate": 1.2655280058787305e-07, + "loss": 0.0289, + "reward": 1.178571492433548, + "reward_std": 0.25384480878710747, + "rewards/accuracy_reward": 0.227678582072258, + "rewards/format_reward": 0.95089291036129, + "step": 3016 + }, + { + "completion_length": 882.232177734375, + "epoch": 0.9012023000522739, + "grad_norm": 1.640181541442871, + "kl": 0.8203125, + "learning_rate": 1.26394187834287e-07, + "loss": -0.0079, + "reward": 1.1629464626312256, + "reward_std": 0.30568627640604973, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9397321790456772, + "step": 3017 + }, + { + "completion_length": 806.0937957763672, + "epoch": 0.9015010081397954, + "grad_norm": 1.9031565189361572, + "kl": 0.8447265625, + "learning_rate": 1.262360359223067e-07, + "loss": 0.0408, + "reward": 1.1562500149011612, + "reward_std": 0.2592725418508053, + "rewards/accuracy_reward": 0.20758929708972573, + "rewards/format_reward": 0.9486607611179352, + "step": 3018 + }, + { + "completion_length": 874.5513763427734, + "epoch": 0.9017997162273168, + "grad_norm": 1.329389214515686, + "kl": 0.7060546875, + "learning_rate": 1.2607834502398602e-07, + "loss": 0.0433, + "reward": 1.066964328289032, + "reward_std": 0.1831890381872654, + "rewards/accuracy_reward": 0.10267857322469354, + "rewards/format_reward": 0.9642857611179352, + "step": 3019 + }, + { + "completion_length": 837.9888763427734, + "epoch": 0.9020984243148383, + "grad_norm": 1.0911887884140015, + "kl": 0.44189453125, + "learning_rate": 1.259211153108772e-07, + "loss": 0.0023, + "reward": 1.2566964626312256, + "reward_std": 0.25125059112906456, + "rewards/accuracy_reward": 0.2790178768336773, + "rewards/format_reward": 0.9776786118745804, + "step": 3020 + }, + { + "completion_length": 953.6384429931641, + "epoch": 0.9023971324023597, + "grad_norm": 1.051392912864685, + "kl": 0.5634765625, + "learning_rate": 1.2576434695403068e-07, + "loss": 0.0125, + "reward": 1.1383928954601288, + "reward_std": 0.17263711243867874, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9687500447034836, + "step": 3021 + }, + { + "completion_length": 820.4754791259766, + "epoch": 0.9026958404898813, + "grad_norm": 1.7317177057266235, + "kl": 0.83837890625, + "learning_rate": 1.2560804012399512e-07, + "loss": 0.0466, + "reward": 1.1383928954601288, + "reward_std": 0.22639509290456772, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.9665178954601288, + "step": 3022 + }, + { + "completion_length": 832.9286041259766, + "epoch": 0.9029945485774027, + "grad_norm": 1.258238673210144, + "kl": 0.81396484375, + "learning_rate": 1.2545219499081707e-07, + "loss": 0.0278, + "reward": 1.1674107611179352, + "reward_std": 0.26279040426015854, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.9531250298023224, + "step": 3023 + }, + { + "completion_length": 944.8036193847656, + "epoch": 0.9032932566649242, + "grad_norm": 0.7732100486755371, + "kl": 0.48486328125, + "learning_rate": 1.2529681172404063e-07, + "loss": 0.0291, + "reward": 1.098214328289032, + "reward_std": 0.21889065951108932, + "rewards/accuracy_reward": 0.13169643096625805, + "rewards/format_reward": 0.9665178954601288, + "step": 3024 + }, + { + "completion_length": 810.2656707763672, + "epoch": 0.9035919647524456, + "grad_norm": 1.2539957761764526, + "kl": 0.8466796875, + "learning_rate": 1.2514189049270776e-07, + "loss": 0.0335, + "reward": 1.2187500596046448, + "reward_std": 0.20952018070966005, + "rewards/accuracy_reward": 0.2433035783469677, + "rewards/format_reward": 0.9754464626312256, + "step": 3025 + }, + { + "completion_length": 964.4174652099609, + "epoch": 0.9038906728399672, + "grad_norm": 0.9735034108161926, + "kl": 0.67529296875, + "learning_rate": 1.2498743146535737e-07, + "loss": 0.0157, + "reward": 1.0290178656578064, + "reward_std": 0.21779046580195427, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.9598214775323868, + "step": 3026 + }, + { + "completion_length": 951.8281860351562, + "epoch": 0.9041893809274886, + "grad_norm": 0.9868967533111572, + "kl": 0.6337890625, + "learning_rate": 1.248334348100258e-07, + "loss": 0.0311, + "reward": 1.0691964775323868, + "reward_std": 0.26833343505859375, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.9441964626312256, + "step": 3027 + }, + { + "completion_length": 889.6562652587891, + "epoch": 0.9044880890150101, + "grad_norm": 2.5625598430633545, + "kl": 0.8525390625, + "learning_rate": 1.246799006942465e-07, + "loss": 0.0264, + "reward": 1.0602678954601288, + "reward_std": 0.17581956647336483, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.95089291036129, + "step": 3028 + }, + { + "completion_length": 853.8549652099609, + "epoch": 0.9047867971025315, + "grad_norm": 0.9624167084693909, + "kl": 0.609375, + "learning_rate": 1.245268292850493e-07, + "loss": 0.0606, + "reward": 1.1986607611179352, + "reward_std": 0.23781999200582504, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9866071790456772, + "step": 3029 + }, + { + "completion_length": 852.294677734375, + "epoch": 0.905085505190053, + "grad_norm": 0.96775883436203, + "kl": 0.56298828125, + "learning_rate": 1.2437422074896093e-07, + "loss": 0.0309, + "reward": 1.1049107313156128, + "reward_std": 0.2053859606385231, + "rewards/accuracy_reward": 0.14508929336443543, + "rewards/format_reward": 0.9598214626312256, + "step": 3030 + }, + { + "completion_length": 895.810302734375, + "epoch": 0.9053842132775745, + "grad_norm": 0.7278904318809509, + "kl": 0.68603515625, + "learning_rate": 1.2422207525200468e-07, + "loss": 0.0056, + "reward": 1.1785715222358704, + "reward_std": 0.27238115668296814, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.9531250596046448, + "step": 3031 + }, + { + "completion_length": 906.4621124267578, + "epoch": 0.905682921365096, + "grad_norm": 0.8265717625617981, + "kl": 0.7431640625, + "learning_rate": 1.2407039295969978e-07, + "loss": 0.0592, + "reward": 1.084821492433548, + "reward_std": 0.24666588753461838, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9665178805589676, + "step": 3032 + }, + { + "completion_length": 908.1942138671875, + "epoch": 0.9059816294526174, + "grad_norm": 0.9555766582489014, + "kl": 0.80078125, + "learning_rate": 1.2391917403706172e-07, + "loss": 0.0859, + "reward": 1.0714286267757416, + "reward_std": 0.26074620708823204, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.957589328289032, + "step": 3033 + }, + { + "completion_length": 937.4286041259766, + "epoch": 0.9062803375401389, + "grad_norm": 0.8382599353790283, + "kl": 0.55810546875, + "learning_rate": 1.2376841864860201e-07, + "loss": 0.0219, + "reward": 1.0848214626312256, + "reward_std": 0.22484927624464035, + "rewards/accuracy_reward": 0.13616072246804833, + "rewards/format_reward": 0.9486607611179352, + "step": 3034 + }, + { + "completion_length": 854.5848541259766, + "epoch": 0.9065790456276603, + "grad_norm": 0.5979026556015015, + "kl": 0.47509765625, + "learning_rate": 1.2361812695832754e-07, + "loss": -0.0, + "reward": 1.2165178954601288, + "reward_std": 0.21507207304239273, + "rewards/accuracy_reward": 0.2477678693830967, + "rewards/format_reward": 0.9687500447034836, + "step": 3035 + }, + { + "completion_length": 979.2076416015625, + "epoch": 0.9068777537151819, + "grad_norm": 3.283942222595215, + "kl": 0.65185546875, + "learning_rate": 1.2346829912974105e-07, + "loss": 0.0208, + "reward": 1.1272321939468384, + "reward_std": 0.17133375629782677, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.979910746216774, + "step": 3036 + }, + { + "completion_length": 843.7187957763672, + "epoch": 0.9071764618027033, + "grad_norm": 1.1209344863891602, + "kl": 0.7509765625, + "learning_rate": 1.233189353258405e-07, + "loss": 0.0369, + "reward": 1.1919643580913544, + "reward_std": 0.2820412367582321, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.9553571939468384, + "step": 3037 + }, + { + "completion_length": 995.2545013427734, + "epoch": 0.9074751698902248, + "grad_norm": 2.2835853099823, + "kl": 0.76025390625, + "learning_rate": 1.2317003570911907e-07, + "loss": 0.0294, + "reward": 1.147321492433548, + "reward_std": 0.2579009383916855, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9464286267757416, + "step": 3038 + }, + { + "completion_length": 863.4576416015625, + "epoch": 0.9077738779777462, + "grad_norm": 0.9668014645576477, + "kl": 0.7626953125, + "learning_rate": 1.23021600441565e-07, + "loss": 0.0491, + "reward": 1.0915178954601288, + "reward_std": 0.1780168367549777, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.9687500447034836, + "step": 3039 + }, + { + "completion_length": 807.8951263427734, + "epoch": 0.9080725860652678, + "grad_norm": 1.1504569053649902, + "kl": 0.734375, + "learning_rate": 1.2287362968466123e-07, + "loss": 0.0303, + "reward": 1.0892857909202576, + "reward_std": 0.20500651746988297, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9754464775323868, + "step": 3040 + }, + { + "completion_length": 870.3080902099609, + "epoch": 0.9083712941527892, + "grad_norm": 0.9915047883987427, + "kl": 0.8134765625, + "learning_rate": 1.2272612359938548e-07, + "loss": 0.0252, + "reward": 1.100446492433548, + "reward_std": 0.21942921727895737, + "rewards/accuracy_reward": 0.14062500861473382, + "rewards/format_reward": 0.9598214775323868, + "step": 3041 + }, + { + "completion_length": 907.5245819091797, + "epoch": 0.9086700022403107, + "grad_norm": 2.146225690841675, + "kl": 0.703125, + "learning_rate": 1.2257908234620993e-07, + "loss": 0.0325, + "reward": 1.2700893580913544, + "reward_std": 0.3157501593232155, + "rewards/accuracy_reward": 0.3169642947614193, + "rewards/format_reward": 0.9531250447034836, + "step": 3042 + }, + { + "completion_length": 917.1763610839844, + "epoch": 0.9089687103278321, + "grad_norm": 0.7200402617454529, + "kl": 0.51416015625, + "learning_rate": 1.22432506085101e-07, + "loss": -0.0049, + "reward": 1.1004464626312256, + "reward_std": 0.20206012669950724, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9665178954601288, + "step": 3043 + }, + { + "completion_length": 875.8393249511719, + "epoch": 0.9092674184153536, + "grad_norm": 1.2850645780563354, + "kl": 0.65673828125, + "learning_rate": 1.2228639497551936e-07, + "loss": 0.0322, + "reward": 1.145089328289032, + "reward_std": 0.24491293355822563, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.9709821790456772, + "step": 3044 + }, + { + "completion_length": 857.669677734375, + "epoch": 0.909566126502875, + "grad_norm": 1.1618154048919678, + "kl": 0.67919921875, + "learning_rate": 1.2214074917641957e-07, + "loss": 0.0339, + "reward": 1.1205357611179352, + "reward_std": 0.2245286926627159, + "rewards/accuracy_reward": 0.15625000605359674, + "rewards/format_reward": 0.9642857611179352, + "step": 3045 + }, + { + "completion_length": 931.7031707763672, + "epoch": 0.9098648345903966, + "grad_norm": 1.4391411542892456, + "kl": 0.8583984375, + "learning_rate": 1.2199556884624992e-07, + "loss": 0.0481, + "reward": 1.0022321790456772, + "reward_std": 0.2245309092104435, + "rewards/accuracy_reward": 0.0535714328289032, + "rewards/format_reward": 0.948660746216774, + "step": 3046 + }, + { + "completion_length": 990.9464874267578, + "epoch": 0.910163542677918, + "grad_norm": 1.7322883605957031, + "kl": 0.667724609375, + "learning_rate": 1.2185085414295242e-07, + "loss": 0.0278, + "reward": 1.06026791036129, + "reward_std": 0.2172122709453106, + "rewards/accuracy_reward": 0.09598214970901608, + "rewards/format_reward": 0.964285746216774, + "step": 3047 + }, + { + "completion_length": 853.3460388183594, + "epoch": 0.9104622507654395, + "grad_norm": 0.9910611510276794, + "kl": 0.474609375, + "learning_rate": 1.2170660522396251e-07, + "loss": -0.021, + "reward": 1.2522321939468384, + "reward_std": 0.228055439889431, + "rewards/accuracy_reward": 0.2767857313156128, + "rewards/format_reward": 0.9754464477300644, + "step": 3048 + }, + { + "completion_length": 988.7723846435547, + "epoch": 0.9107609588529609, + "grad_norm": 1.1604489088058472, + "kl": 0.65625, + "learning_rate": 1.2156282224620884e-07, + "loss": 0.021, + "reward": 1.1093750298023224, + "reward_std": 0.2567117903381586, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 3049 + }, + { + "completion_length": 974.9107360839844, + "epoch": 0.9110596669404825, + "grad_norm": 0.9227564930915833, + "kl": 0.6240234375, + "learning_rate": 1.214195053661132e-07, + "loss": 0.027, + "reward": 1.2388393580913544, + "reward_std": 0.2875451184809208, + "rewards/accuracy_reward": 0.2723214365541935, + "rewards/format_reward": 0.9665178954601288, + "step": 3050 + }, + { + "completion_length": 947.810302734375, + "epoch": 0.9113583750280039, + "grad_norm": 1.5785822868347168, + "kl": 0.59716796875, + "learning_rate": 1.212766547395904e-07, + "loss": 0.025, + "reward": 1.0714285969734192, + "reward_std": 0.22253290936350822, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.9531250447034836, + "step": 3051 + }, + { + "completion_length": 924.4777221679688, + "epoch": 0.9116570831155254, + "grad_norm": 0.9322715401649475, + "kl": 0.7109375, + "learning_rate": 1.2113427052204772e-07, + "loss": -0.0042, + "reward": 1.0669642984867096, + "reward_std": 0.24970708414912224, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.9620536118745804, + "step": 3052 + }, + { + "completion_length": 965.4487152099609, + "epoch": 0.9119557912030468, + "grad_norm": 0.9804964661598206, + "kl": 0.47021484375, + "learning_rate": 1.2099235286838544e-07, + "loss": -0.0053, + "reward": 1.178571492433548, + "reward_std": 0.2315022386610508, + "rewards/accuracy_reward": 0.21428572945296764, + "rewards/format_reward": 0.9642857611179352, + "step": 3053 + }, + { + "completion_length": 921.9643249511719, + "epoch": 0.9122544992905683, + "grad_norm": 2.041198253631592, + "kl": 0.81103515625, + "learning_rate": 1.2085090193299593e-07, + "loss": 0.0656, + "reward": 1.0758928805589676, + "reward_std": 0.26902658864855766, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.9397321939468384, + "step": 3054 + }, + { + "completion_length": 1027.19873046875, + "epoch": 0.9125532073780898, + "grad_norm": 1.0695685148239136, + "kl": 0.6455078125, + "learning_rate": 1.2070991786976397e-07, + "loss": 0.0227, + "reward": 1.1428571939468384, + "reward_std": 0.2504858337342739, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.9598214626312256, + "step": 3055 + }, + { + "completion_length": 825.6339569091797, + "epoch": 0.9128519154656113, + "grad_norm": 0.7077593803405762, + "kl": 0.6728515625, + "learning_rate": 1.205694008320665e-07, + "loss": 0.032, + "reward": 1.1986607611179352, + "reward_std": 0.23980672657489777, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.957589328289032, + "step": 3056 + }, + { + "completion_length": 856.5178985595703, + "epoch": 0.9131506235531327, + "grad_norm": 0.651991605758667, + "kl": 0.6494140625, + "learning_rate": 1.2042935097277207e-07, + "loss": 0.0168, + "reward": 1.2455357611179352, + "reward_std": 0.24755601957440376, + "rewards/accuracy_reward": 0.279017873108387, + "rewards/format_reward": 0.96651791036129, + "step": 3057 + }, + { + "completion_length": 943.2388763427734, + "epoch": 0.9134493316406541, + "grad_norm": 1.047898769378662, + "kl": 0.6005859375, + "learning_rate": 1.2028976844424142e-07, + "loss": 0.039, + "reward": 1.0758928805589676, + "reward_std": 0.20162883773446083, + "rewards/accuracy_reward": 0.10714285937137902, + "rewards/format_reward": 0.9687500298023224, + "step": 3058 + }, + { + "completion_length": 975.4643402099609, + "epoch": 0.9137480397281756, + "grad_norm": 1.2470687627792358, + "kl": 0.908203125, + "learning_rate": 1.201506533983266e-07, + "loss": 0.0242, + "reward": 1.1049107909202576, + "reward_std": 0.2999573200941086, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.9553571790456772, + "step": 3059 + }, + { + "completion_length": 761.7969055175781, + "epoch": 0.9140467478156971, + "grad_norm": 0.8577203750610352, + "kl": 0.6796875, + "learning_rate": 1.2001200598637105e-07, + "loss": 0.0058, + "reward": 1.2098215222358704, + "reward_std": 0.2617327459156513, + "rewards/accuracy_reward": 0.2455357201397419, + "rewards/format_reward": 0.9642857611179352, + "step": 3060 + }, + { + "completion_length": 809.4018249511719, + "epoch": 0.9143454559032186, + "grad_norm": 1.5882874727249146, + "kl": 0.74560546875, + "learning_rate": 1.1987382635920966e-07, + "loss": 0.0036, + "reward": 1.1316964626312256, + "reward_std": 0.24727346375584602, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9709821939468384, + "step": 3061 + }, + { + "completion_length": 891.6764068603516, + "epoch": 0.91464416399074, + "grad_norm": 0.923714280128479, + "kl": 0.81640625, + "learning_rate": 1.197361146671683e-07, + "loss": 0.0314, + "reward": 1.0267857909202576, + "reward_std": 0.19674052856862545, + "rewards/accuracy_reward": 0.06696429033763707, + "rewards/format_reward": 0.9598214775323868, + "step": 3062 + }, + { + "completion_length": 856.6853179931641, + "epoch": 0.9149428720782615, + "grad_norm": 2.549285650253296, + "kl": 0.65234375, + "learning_rate": 1.195988710600638e-07, + "loss": 0.0327, + "reward": 1.1071429252624512, + "reward_std": 0.22423140704631805, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.96651791036129, + "step": 3063 + }, + { + "completion_length": 860.9754943847656, + "epoch": 0.9152415801657829, + "grad_norm": 2.0855765342712402, + "kl": 0.6748046875, + "learning_rate": 1.1946209568720367e-07, + "loss": 0.0172, + "reward": 1.1808036267757416, + "reward_std": 0.22574037313461304, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.9709821939468384, + "step": 3064 + }, + { + "completion_length": 974.2433319091797, + "epoch": 0.9155402882533045, + "grad_norm": 1.204262137413025, + "kl": 0.763671875, + "learning_rate": 1.193257886973863e-07, + "loss": -0.0164, + "reward": 1.1517857611179352, + "reward_std": 0.24415123462677002, + "rewards/accuracy_reward": 0.20089286752045155, + "rewards/format_reward": 0.9508928954601288, + "step": 3065 + }, + { + "completion_length": 854.7120819091797, + "epoch": 0.9158389963408259, + "grad_norm": 3.2687876224517822, + "kl": 0.822265625, + "learning_rate": 1.1918995023890014e-07, + "loss": 0.0575, + "reward": 1.191964328289032, + "reward_std": 0.2842353470623493, + "rewards/accuracy_reward": 0.24107143841683865, + "rewards/format_reward": 0.9508928954601288, + "step": 3066 + }, + { + "completion_length": 851.2500457763672, + "epoch": 0.9161377044283474, + "grad_norm": 1.6827058792114258, + "kl": 0.5478515625, + "learning_rate": 1.1905458045952423e-07, + "loss": 0.0183, + "reward": 1.1629464626312256, + "reward_std": 0.222088985145092, + "rewards/accuracy_reward": 0.2142857313156128, + "rewards/format_reward": 0.948660746216774, + "step": 3067 + }, + { + "completion_length": 893.6920166015625, + "epoch": 0.9164364125158688, + "grad_norm": 1.228715419769287, + "kl": 0.5615234375, + "learning_rate": 1.1891967950652759e-07, + "loss": 0.019, + "reward": 1.100446492433548, + "reward_std": 0.2738700956106186, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.96651791036129, + "step": 3068 + }, + { + "completion_length": 824.4955749511719, + "epoch": 0.9167351206033904, + "grad_norm": 2.024465560913086, + "kl": 0.81201171875, + "learning_rate": 1.1878524752666918e-07, + "loss": 0.0315, + "reward": 1.1383929252624512, + "reward_std": 0.2546297274529934, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.9575893133878708, + "step": 3069 + }, + { + "completion_length": 873.3013916015625, + "epoch": 0.9170338286909118, + "grad_norm": 1.1909656524658203, + "kl": 0.7314453125, + "learning_rate": 1.1865128466619794e-07, + "loss": 0.0326, + "reward": 1.145089328289032, + "reward_std": 0.253474123775959, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.9665178954601288, + "step": 3070 + }, + { + "completion_length": 959.9866638183594, + "epoch": 0.9173325367784333, + "grad_norm": 1.0750454664230347, + "kl": 0.9306640625, + "learning_rate": 1.1851779107085223e-07, + "loss": 0.0261, + "reward": 1.111607164144516, + "reward_std": 0.3260790929198265, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.9464286267757416, + "step": 3071 + }, + { + "completion_length": 989.3638763427734, + "epoch": 0.9176312448659547, + "grad_norm": 0.8707254528999329, + "kl": 0.609375, + "learning_rate": 1.1838476688586005e-07, + "loss": 0.032, + "reward": 0.9933036118745804, + "reward_std": 0.22977178916335106, + "rewards/accuracy_reward": 0.0602678619325161, + "rewards/format_reward": 0.9330357760190964, + "step": 3072 + }, + { + "completion_length": 987.2120971679688, + "epoch": 0.9179299529534762, + "grad_norm": 6.186123371124268, + "kl": 0.83984375, + "learning_rate": 1.1825221225593865e-07, + "loss": 0.0079, + "reward": 1.2142857611179352, + "reward_std": 0.2269633412361145, + "rewards/accuracy_reward": 0.2522321529686451, + "rewards/format_reward": 0.9620536118745804, + "step": 3073 + }, + { + "completion_length": 898.9933624267578, + "epoch": 0.9182286610409977, + "grad_norm": 1.8214856386184692, + "kl": 0.7646484375, + "learning_rate": 1.1812012732529445e-07, + "loss": -0.0003, + "reward": 1.1183036267757416, + "reward_std": 0.2605658918619156, + "rewards/accuracy_reward": 0.1674107275903225, + "rewards/format_reward": 0.9508928954601288, + "step": 3074 + }, + { + "completion_length": 910.3839569091797, + "epoch": 0.9185273691285192, + "grad_norm": 1.8733516931533813, + "kl": 0.6298828125, + "learning_rate": 1.17988512237623e-07, + "loss": 0.0356, + "reward": 1.1674107909202576, + "reward_std": 0.2692366987466812, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.96651791036129, + "step": 3075 + }, + { + "completion_length": 866.6853332519531, + "epoch": 0.9188260772160406, + "grad_norm": 1.4495710134506226, + "kl": 1.14453125, + "learning_rate": 1.1785736713610854e-07, + "loss": 0.0714, + "reward": 1.1741071939468384, + "reward_std": 0.28948382288217545, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.941964328289032, + "step": 3076 + }, + { + "completion_length": 846.2991485595703, + "epoch": 0.9191247853035621, + "grad_norm": 1.8437517881393433, + "kl": 1.091796875, + "learning_rate": 1.177266921634241e-07, + "loss": 0.0128, + "reward": 1.1696429252624512, + "reward_std": 0.2671457715332508, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9531250447034836, + "step": 3077 + }, + { + "completion_length": 869.2433471679688, + "epoch": 0.9194234933910835, + "grad_norm": 1.6063745021820068, + "kl": 0.7900390625, + "learning_rate": 1.1759648746173125e-07, + "loss": 0.0016, + "reward": 1.051339328289032, + "reward_std": 0.25903138518333435, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.9464286118745804, + "step": 3078 + }, + { + "completion_length": 898.8482666015625, + "epoch": 0.9197222014786051, + "grad_norm": 1.9076340198516846, + "kl": 0.8115234375, + "learning_rate": 1.1746675317267998e-07, + "loss": -0.0039, + "reward": 1.102678656578064, + "reward_std": 0.25433992967009544, + "rewards/accuracy_reward": 0.14732143748551607, + "rewards/format_reward": 0.9553571790456772, + "step": 3079 + }, + { + "completion_length": 899.4464569091797, + "epoch": 0.9200209095661265, + "grad_norm": 1.7615408897399902, + "kl": 0.9150390625, + "learning_rate": 1.1733748943740837e-07, + "loss": 0.0728, + "reward": 1.160714328289032, + "reward_std": 0.3050563298165798, + "rewards/accuracy_reward": 0.21651786379516125, + "rewards/format_reward": 0.9441964626312256, + "step": 3080 + }, + { + "completion_length": 979.6540679931641, + "epoch": 0.920319617653648, + "grad_norm": 2.1693570613861084, + "kl": 0.8193359375, + "learning_rate": 1.1720869639654285e-07, + "loss": 0.0263, + "reward": 1.020089328289032, + "reward_std": 0.2262551486492157, + "rewards/accuracy_reward": 0.0691964291036129, + "rewards/format_reward": 0.9508928954601288, + "step": 3081 + }, + { + "completion_length": 920.4777069091797, + "epoch": 0.9206183257411694, + "grad_norm": 0.9162221550941467, + "kl": 0.77099609375, + "learning_rate": 1.1708037419019751e-07, + "loss": 0.0509, + "reward": 1.1116071939468384, + "reward_std": 0.18772963527590036, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9732143133878708, + "step": 3082 + }, + { + "completion_length": 912.5312957763672, + "epoch": 0.920917033828691, + "grad_norm": 1.659554362297058, + "kl": 0.7958984375, + "learning_rate": 1.1695252295797432e-07, + "loss": 0.0233, + "reward": 1.0691964626312256, + "reward_std": 0.23887574672698975, + "rewards/accuracy_reward": 0.12053572246804833, + "rewards/format_reward": 0.948660746216774, + "step": 3083 + }, + { + "completion_length": 914.013427734375, + "epoch": 0.9212157419162124, + "grad_norm": 1.4263607263565063, + "kl": 0.74267578125, + "learning_rate": 1.1682514283896303e-07, + "loss": -0.0093, + "reward": 1.2165178954601288, + "reward_std": 0.2783673107624054, + "rewards/accuracy_reward": 0.2678571529686451, + "rewards/format_reward": 0.948660746216774, + "step": 3084 + }, + { + "completion_length": 878.7500457763672, + "epoch": 0.9215144500037339, + "grad_norm": 1.3882778882980347, + "kl": 0.61865234375, + "learning_rate": 1.1669823397174055e-07, + "loss": 0.0158, + "reward": 1.1227678954601288, + "reward_std": 0.25453080981969833, + "rewards/accuracy_reward": 0.17633929289877415, + "rewards/format_reward": 0.9464286267757416, + "step": 3085 + }, + { + "completion_length": 873.0982513427734, + "epoch": 0.9218131580912553, + "grad_norm": 2.072330951690674, + "kl": 0.61669921875, + "learning_rate": 1.1657179649437134e-07, + "loss": 0.0673, + "reward": 1.1406250298023224, + "reward_std": 0.2113778293132782, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9687500447034836, + "step": 3086 + }, + { + "completion_length": 949.8995819091797, + "epoch": 0.9221118661787768, + "grad_norm": 1.1898506879806519, + "kl": 0.849609375, + "learning_rate": 1.1644583054440712e-07, + "loss": 0.0661, + "reward": 1.1205357611179352, + "reward_std": 0.2637827917933464, + "rewards/accuracy_reward": 0.16517858393490314, + "rewards/format_reward": 0.9553571939468384, + "step": 3087 + }, + { + "completion_length": 889.1384429931641, + "epoch": 0.9224105742662982, + "grad_norm": 1.4234822988510132, + "kl": 0.85595703125, + "learning_rate": 1.1632033625888633e-07, + "loss": 0.0247, + "reward": 1.0870536267757416, + "reward_std": 0.21386521309614182, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.9709821939468384, + "step": 3088 + }, + { + "completion_length": 972.9978179931641, + "epoch": 0.9227092823538198, + "grad_norm": 1.2415071725845337, + "kl": 0.55908203125, + "learning_rate": 1.1619531377433454e-07, + "loss": -0.0179, + "reward": 1.0915178954601288, + "reward_std": 0.17718024924397469, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.9709821790456772, + "step": 3089 + }, + { + "completion_length": 871.8638763427734, + "epoch": 0.9230079904413412, + "grad_norm": 1.3702242374420166, + "kl": 0.53564453125, + "learning_rate": 1.1607076322676396e-07, + "loss": 0.013, + "reward": 1.0848214626312256, + "reward_std": 0.21263213455677032, + "rewards/accuracy_reward": 0.12053572246804833, + "rewards/format_reward": 0.9642857611179352, + "step": 3090 + }, + { + "completion_length": 913.2835235595703, + "epoch": 0.9233066985288627, + "grad_norm": 1.1258752346038818, + "kl": 0.6103515625, + "learning_rate": 1.1594668475167331e-07, + "loss": 0.0239, + "reward": 1.1026786416769028, + "reward_std": 0.23275811597704887, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.964285746216774, + "step": 3091 + }, + { + "completion_length": 937.7924652099609, + "epoch": 0.9236054066163841, + "grad_norm": 1.090934157371521, + "kl": 0.869140625, + "learning_rate": 1.1582307848404785e-07, + "loss": 0.0371, + "reward": 1.0424107760190964, + "reward_std": 0.29487817734479904, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.9531250447034836, + "step": 3092 + }, + { + "completion_length": 878.8973541259766, + "epoch": 0.9239041147039057, + "grad_norm": 1.2185063362121582, + "kl": 0.732421875, + "learning_rate": 1.1569994455835911e-07, + "loss": 0.0001, + "reward": 1.1473214626312256, + "reward_std": 0.24734698981046677, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9531250298023224, + "step": 3093 + }, + { + "completion_length": 761.7857513427734, + "epoch": 0.9242028227914271, + "grad_norm": 1.8551816940307617, + "kl": 0.517578125, + "learning_rate": 1.1557728310856467e-07, + "loss": 0.0221, + "reward": 1.1696429252624512, + "reward_std": 0.24164282903075218, + "rewards/accuracy_reward": 0.2075892984867096, + "rewards/format_reward": 0.9620536118745804, + "step": 3094 + }, + { + "completion_length": 881.3549652099609, + "epoch": 0.9245015308789486, + "grad_norm": 1.3964641094207764, + "kl": 0.70166015625, + "learning_rate": 1.154550942681083e-07, + "loss": 0.0179, + "reward": 1.0647321939468384, + "reward_std": 0.2138325534760952, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.957589328289032, + "step": 3095 + }, + { + "completion_length": 986.7835235595703, + "epoch": 0.92480023896647, + "grad_norm": 0.8413378596305847, + "kl": 0.7216796875, + "learning_rate": 1.1533337816991931e-07, + "loss": 0.0028, + "reward": 1.066964328289032, + "reward_std": 0.23221705108880997, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.96651791036129, + "step": 3096 + }, + { + "completion_length": 950.3125457763672, + "epoch": 0.9250989470539915, + "grad_norm": 1.0395314693450928, + "kl": 0.73095703125, + "learning_rate": 1.1521213494641294e-07, + "loss": 0.0275, + "reward": 1.0468750447034836, + "reward_std": 0.22502755001187325, + "rewards/accuracy_reward": 0.1049107238650322, + "rewards/format_reward": 0.9419643133878708, + "step": 3097 + }, + { + "completion_length": 968.3080749511719, + "epoch": 0.925397655141513, + "grad_norm": 1.6686254739761353, + "kl": 0.94921875, + "learning_rate": 1.1509136472949004e-07, + "loss": 0.0549, + "reward": 1.0401786267757416, + "reward_std": 0.19900395721197128, + "rewards/accuracy_reward": 0.08258929220028222, + "rewards/format_reward": 0.957589328289032, + "step": 3098 + }, + { + "completion_length": 770.9129791259766, + "epoch": 0.9256963632290345, + "grad_norm": 0.9651997089385986, + "kl": 0.83984375, + "learning_rate": 1.1497106765053663e-07, + "loss": 0.0393, + "reward": 1.1517857611179352, + "reward_std": 0.2160481307655573, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9397321790456772, + "step": 3099 + }, + { + "completion_length": 868.9263763427734, + "epoch": 0.9259950713165559, + "grad_norm": 1.1254959106445312, + "kl": 0.72802734375, + "learning_rate": 1.1485124384042418e-07, + "loss": 0.0237, + "reward": 1.1808035969734192, + "reward_std": 0.21213748678565025, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.9486607611179352, + "step": 3100 + }, + { + "completion_length": 934.1830749511719, + "epoch": 0.9262937794040773, + "grad_norm": 1.6203038692474365, + "kl": 0.630859375, + "learning_rate": 1.1473189342950936e-07, + "loss": 0.0287, + "reward": 1.1741072237491608, + "reward_std": 0.27009523659944534, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.957589328289032, + "step": 3101 + }, + { + "completion_length": 919.9419860839844, + "epoch": 0.9265924874915988, + "grad_norm": 0.794758141040802, + "kl": 0.59130859375, + "learning_rate": 1.1461301654763352e-07, + "loss": 0.0184, + "reward": 1.0758929252624512, + "reward_std": 0.15305563434958458, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.9732143431901932, + "step": 3102 + }, + { + "completion_length": 895.0513763427734, + "epoch": 0.9268911955791203, + "grad_norm": 0.9132658839225769, + "kl": 0.60595703125, + "learning_rate": 1.1449461332412321e-07, + "loss": 0.0198, + "reward": 1.0848214775323868, + "reward_std": 0.23344676941633224, + "rewards/accuracy_reward": 0.11160714831203222, + "rewards/format_reward": 0.973214328289032, + "step": 3103 + }, + { + "completion_length": 944.1830902099609, + "epoch": 0.9271899036666418, + "grad_norm": 1.0762419700622559, + "kl": 0.8857421875, + "learning_rate": 1.1437668388778955e-07, + "loss": 0.0334, + "reward": 1.1473214626312256, + "reward_std": 0.22429049387574196, + "rewards/accuracy_reward": 0.191964291036129, + "rewards/format_reward": 0.9553571790456772, + "step": 3104 + }, + { + "completion_length": 808.0201263427734, + "epoch": 0.9274886117541632, + "grad_norm": 1.123429536819458, + "kl": 0.8017578125, + "learning_rate": 1.1425922836692805e-07, + "loss": -0.0016, + "reward": 1.0915178954601288, + "reward_std": 0.1790735088288784, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.9843750447034836, + "step": 3105 + }, + { + "completion_length": 942.5781707763672, + "epoch": 0.9277873198416847, + "grad_norm": 0.941902220249176, + "kl": 0.6962890625, + "learning_rate": 1.1414224688931896e-07, + "loss": 0.0367, + "reward": 1.1830357611179352, + "reward_std": 0.25528573244810104, + "rewards/accuracy_reward": 0.22767858766019344, + "rewards/format_reward": 0.9553571790456772, + "step": 3106 + }, + { + "completion_length": 922.8437957763672, + "epoch": 0.9280860279292061, + "grad_norm": 1.1015210151672363, + "kl": 0.72802734375, + "learning_rate": 1.1402573958222661e-07, + "loss": -0.0225, + "reward": 1.1138393431901932, + "reward_std": 0.24965764954686165, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.9464286118745804, + "step": 3107 + }, + { + "completion_length": 846.3772735595703, + "epoch": 0.9283847360167277, + "grad_norm": 1.3434596061706543, + "kl": 0.626953125, + "learning_rate": 1.1390970657239948e-07, + "loss": 0.0144, + "reward": 1.1205357611179352, + "reward_std": 0.22274309769272804, + "rewards/accuracy_reward": 0.15178572433069348, + "rewards/format_reward": 0.9687500447034836, + "step": 3108 + }, + { + "completion_length": 955.8170318603516, + "epoch": 0.9286834441042491, + "grad_norm": 2.056769847869873, + "kl": 0.63720703125, + "learning_rate": 1.1379414798607019e-07, + "loss": 0.0637, + "reward": 1.2031250298023224, + "reward_std": 0.3314131461083889, + "rewards/accuracy_reward": 0.2433035857975483, + "rewards/format_reward": 0.9598214626312256, + "step": 3109 + }, + { + "completion_length": 1029.5201416015625, + "epoch": 0.9289821521917706, + "grad_norm": 1.3402882814407349, + "kl": 0.59765625, + "learning_rate": 1.1367906394895511e-07, + "loss": 0.002, + "reward": 1.1071429252624512, + "reward_std": 0.20210502296686172, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.9687500447034836, + "step": 3110 + }, + { + "completion_length": 760.8348541259766, + "epoch": 0.929280860279292, + "grad_norm": 1.7925525903701782, + "kl": 0.6064453125, + "learning_rate": 1.1356445458625436e-07, + "loss": 0.0287, + "reward": 1.0825893580913544, + "reward_std": 0.20596649125218391, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.9687500298023224, + "step": 3111 + }, + { + "completion_length": 882.6808624267578, + "epoch": 0.9295795683668135, + "grad_norm": 1.1197729110717773, + "kl": 0.57080078125, + "learning_rate": 1.1345032002265181e-07, + "loss": 0.0141, + "reward": 1.1361607611179352, + "reward_std": 0.2067993711680174, + "rewards/accuracy_reward": 0.17857143469154835, + "rewards/format_reward": 0.957589328289032, + "step": 3112 + }, + { + "completion_length": 871.3839721679688, + "epoch": 0.929878276454335, + "grad_norm": 0.9422926902770996, + "kl": 0.6513671875, + "learning_rate": 1.1333666038231457e-07, + "loss": 0.0092, + "reward": 1.189732164144516, + "reward_std": 0.22604256309568882, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9709821939468384, + "step": 3113 + }, + { + "completion_length": 942.3259429931641, + "epoch": 0.9301769845418565, + "grad_norm": 1.4377340078353882, + "kl": 0.8447265625, + "learning_rate": 1.1322347578889322e-07, + "loss": 0.0533, + "reward": 1.1897321939468384, + "reward_std": 0.3010142520070076, + "rewards/accuracy_reward": 0.2388392984867096, + "rewards/format_reward": 0.9508928954601288, + "step": 3114 + }, + { + "completion_length": 959.0111999511719, + "epoch": 0.9304756926293779, + "grad_norm": 0.6247971653938293, + "kl": 0.7177734375, + "learning_rate": 1.131107663655216e-07, + "loss": 0.0282, + "reward": 1.0647321790456772, + "reward_std": 0.16426699236035347, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.9687500447034836, + "step": 3115 + }, + { + "completion_length": 957.7478179931641, + "epoch": 0.9307744007168994, + "grad_norm": 2.5405807495117188, + "kl": 0.6142578125, + "learning_rate": 1.1299853223481634e-07, + "loss": 0.0112, + "reward": 1.2008928954601288, + "reward_std": 0.28325483947992325, + "rewards/accuracy_reward": 0.2633928693830967, + "rewards/format_reward": 0.9375000298023224, + "step": 3116 + }, + { + "completion_length": 806.419677734375, + "epoch": 0.9310731088044208, + "grad_norm": 1.1021109819412231, + "kl": 0.7744140625, + "learning_rate": 1.1288677351887724e-07, + "loss": 0.0383, + "reward": 1.2232143878936768, + "reward_std": 0.2105226144194603, + "rewards/accuracy_reward": 0.2544643022119999, + "rewards/format_reward": 0.9687500298023224, + "step": 3117 + }, + { + "completion_length": 921.7969207763672, + "epoch": 0.9313718168919424, + "grad_norm": 0.7653446793556213, + "kl": 0.57275390625, + "learning_rate": 1.1277549033928693e-07, + "loss": 0.0269, + "reward": 1.1250000596046448, + "reward_std": 0.22662648558616638, + "rewards/accuracy_reward": 0.16741071734577417, + "rewards/format_reward": 0.957589328289032, + "step": 3118 + }, + { + "completion_length": 950.716552734375, + "epoch": 0.9316705249794638, + "grad_norm": 1.0750302076339722, + "kl": 0.56884765625, + "learning_rate": 1.1266468281711048e-07, + "loss": 0.0398, + "reward": 1.1941964626312256, + "reward_std": 0.2584258336573839, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.9709821939468384, + "step": 3119 + }, + { + "completion_length": 890.3794860839844, + "epoch": 0.9319692330669853, + "grad_norm": 1.1982134580612183, + "kl": 0.650390625, + "learning_rate": 1.1255435107289571e-07, + "loss": 0.0331, + "reward": 1.1183036118745804, + "reward_std": 0.19476262107491493, + "rewards/accuracy_reward": 0.15401786309666932, + "rewards/format_reward": 0.964285746216774, + "step": 3120 + }, + { + "completion_length": 919.044677734375, + "epoch": 0.9322679411545067, + "grad_norm": 1.2840309143066406, + "kl": 0.6416015625, + "learning_rate": 1.1244449522667261e-07, + "loss": 0.0296, + "reward": 1.084821492433548, + "reward_std": 0.29636937752366066, + "rewards/accuracy_reward": 0.14062500838190317, + "rewards/format_reward": 0.9441964775323868, + "step": 3121 + }, + { + "completion_length": 918.1362152099609, + "epoch": 0.9325666492420283, + "grad_norm": 1.2223864793777466, + "kl": 0.81396484375, + "learning_rate": 1.1233511539795367e-07, + "loss": 0.0478, + "reward": 1.2053571939468384, + "reward_std": 0.280695166438818, + "rewards/accuracy_reward": 0.2477678656578064, + "rewards/format_reward": 0.9575893133878708, + "step": 3122 + }, + { + "completion_length": 919.6495971679688, + "epoch": 0.9328653573295497, + "grad_norm": 0.7366257905960083, + "kl": 0.65869140625, + "learning_rate": 1.1222621170573346e-07, + "loss": 0.0188, + "reward": 1.113839328289032, + "reward_std": 0.2475901059806347, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.957589328289032, + "step": 3123 + }, + { + "completion_length": 883.3170013427734, + "epoch": 0.9331640654170712, + "grad_norm": 1.2836638689041138, + "kl": 0.61474609375, + "learning_rate": 1.121177842684884e-07, + "loss": 0.039, + "reward": 1.1093750894069672, + "reward_std": 0.2726181894540787, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.941964328289032, + "step": 3124 + }, + { + "completion_length": 950.5379791259766, + "epoch": 0.9334627735045926, + "grad_norm": 1.6309031248092651, + "kl": 1.00146484375, + "learning_rate": 1.1200983320417704e-07, + "loss": 0.0158, + "reward": 1.176339328289032, + "reward_std": 0.2593604810535908, + "rewards/accuracy_reward": 0.2276785857975483, + "rewards/format_reward": 0.9486607611179352, + "step": 3125 + }, + { + "completion_length": 921.7053985595703, + "epoch": 0.9337614815921141, + "grad_norm": 1.0224428176879883, + "kl": 0.6923828125, + "learning_rate": 1.1190235863023949e-07, + "loss": 0.0237, + "reward": 1.116071492433548, + "reward_std": 0.2141517736017704, + "rewards/accuracy_reward": 0.16517858253791928, + "rewards/format_reward": 0.9508928954601288, + "step": 3126 + }, + { + "completion_length": 924.8348541259766, + "epoch": 0.9340601896796356, + "grad_norm": 1.9037957191467285, + "kl": 0.8232421875, + "learning_rate": 1.1179536066359757e-07, + "loss": 0.0112, + "reward": 1.0312500447034836, + "reward_std": 0.23457641154527664, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.9486607611179352, + "step": 3127 + }, + { + "completion_length": 914.2344207763672, + "epoch": 0.9343588977671571, + "grad_norm": 1.2769373655319214, + "kl": 0.609375, + "learning_rate": 1.1168883942065457e-07, + "loss": 0.0524, + "reward": 1.2254464626312256, + "reward_std": 0.24732380360364914, + "rewards/accuracy_reward": 0.2767857201397419, + "rewards/format_reward": 0.948660746216774, + "step": 3128 + }, + { + "completion_length": 920.7500305175781, + "epoch": 0.9346576058546785, + "grad_norm": 1.592655062675476, + "kl": 0.47607421875, + "learning_rate": 1.1158279501729518e-07, + "loss": 0.0214, + "reward": 1.1004464626312256, + "reward_std": 0.2533307299017906, + "rewards/accuracy_reward": 0.13839286426082253, + "rewards/format_reward": 0.9620536118745804, + "step": 3129 + }, + { + "completion_length": 873.1786193847656, + "epoch": 0.9349563139422, + "grad_norm": 1.017015814781189, + "kl": 0.6103515625, + "learning_rate": 1.1147722756888528e-07, + "loss": -0.0177, + "reward": 1.0781250596046448, + "reward_std": 0.26465777680277824, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.948660746216774, + "step": 3130 + }, + { + "completion_length": 933.6317443847656, + "epoch": 0.9352550220297214, + "grad_norm": 1.515121340751648, + "kl": 0.48828125, + "learning_rate": 1.1137213719027196e-07, + "loss": 0.0371, + "reward": 1.0290178805589676, + "reward_std": 0.23184329085052013, + "rewards/accuracy_reward": 0.07589286100119352, + "rewards/format_reward": 0.9531250447034836, + "step": 3131 + }, + { + "completion_length": 935.1295013427734, + "epoch": 0.935553730117243, + "grad_norm": 0.7542113065719604, + "kl": 0.52294921875, + "learning_rate": 1.1126752399578324e-07, + "loss": 0.0037, + "reward": 1.0558036267757416, + "reward_std": 0.2585061639547348, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.957589328289032, + "step": 3132 + }, + { + "completion_length": 895.1428833007812, + "epoch": 0.9358524382047644, + "grad_norm": 1.1473079919815063, + "kl": 0.560546875, + "learning_rate": 1.11163388099228e-07, + "loss": 0.0342, + "reward": 1.225446492433548, + "reward_std": 0.24420179799199104, + "rewards/accuracy_reward": 0.25223215855658054, + "rewards/format_reward": 0.973214328289032, + "step": 3133 + }, + { + "completion_length": 829.6674499511719, + "epoch": 0.9361511462922859, + "grad_norm": 1.49228835105896, + "kl": 0.47119140625, + "learning_rate": 1.1105972961389592e-07, + "loss": 0.0183, + "reward": 1.1272321939468384, + "reward_std": 0.21981102600693703, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.9620536267757416, + "step": 3134 + }, + { + "completion_length": 863.9799499511719, + "epoch": 0.9364498543798073, + "grad_norm": 2.0762603282928467, + "kl": 0.8115234375, + "learning_rate": 1.1095654865255717e-07, + "loss": 0.0378, + "reward": 1.0937500596046448, + "reward_std": 0.24361960217356682, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.9464286267757416, + "step": 3135 + }, + { + "completion_length": 826.1741485595703, + "epoch": 0.9367485624673288, + "grad_norm": 1.7212753295898438, + "kl": 0.771484375, + "learning_rate": 1.1085384532746265e-07, + "loss": 0.0425, + "reward": 1.1473214626312256, + "reward_std": 0.24325961619615555, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.948660746216774, + "step": 3136 + }, + { + "completion_length": 878.4911193847656, + "epoch": 0.9370472705548503, + "grad_norm": 1.0893559455871582, + "kl": 0.677734375, + "learning_rate": 1.1075161975034348e-07, + "loss": 0.0204, + "reward": 1.191964328289032, + "reward_std": 0.2282416969537735, + "rewards/accuracy_reward": 0.22991072619333863, + "rewards/format_reward": 0.9620536118745804, + "step": 3137 + }, + { + "completion_length": 904.5290679931641, + "epoch": 0.9373459786423718, + "grad_norm": 1.216951608657837, + "kl": 0.578125, + "learning_rate": 1.10649872032411e-07, + "loss": 0.0063, + "reward": 1.1183036267757416, + "reward_std": 0.2163324821740389, + "rewards/accuracy_reward": 0.1428571459837258, + "rewards/format_reward": 0.9754464775323868, + "step": 3138 + }, + { + "completion_length": 930.9107666015625, + "epoch": 0.9376446867298932, + "grad_norm": 1.0987963676452637, + "kl": 0.59130859375, + "learning_rate": 1.1054860228435685e-07, + "loss": 0.0535, + "reward": 1.1183036416769028, + "reward_std": 0.2480652891099453, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.9531250447034836, + "step": 3139 + }, + { + "completion_length": 946.8103179931641, + "epoch": 0.9379433948174147, + "grad_norm": 0.952965259552002, + "kl": 0.55029296875, + "learning_rate": 1.1044781061635259e-07, + "loss": 0.0137, + "reward": 1.0758928954601288, + "reward_std": 0.1512673981487751, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.9776786267757416, + "step": 3140 + }, + { + "completion_length": 901.9844207763672, + "epoch": 0.9382421029049361, + "grad_norm": 0.9425640106201172, + "kl": 0.552734375, + "learning_rate": 1.1034749713804957e-07, + "loss": 0.0086, + "reward": 1.0982143431901932, + "reward_std": 0.1704837568104267, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.9687500447034836, + "step": 3141 + }, + { + "completion_length": 931.1897735595703, + "epoch": 0.9385408109924577, + "grad_norm": 0.6173871755599976, + "kl": 0.67041015625, + "learning_rate": 1.1024766195857908e-07, + "loss": 0.0225, + "reward": 1.1696428954601288, + "reward_std": 0.24251359142363071, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.957589328289032, + "step": 3142 + }, + { + "completion_length": 996.5848846435547, + "epoch": 0.9388395190799791, + "grad_norm": 0.9260808229446411, + "kl": 0.68359375, + "learning_rate": 1.1014830518655207e-07, + "loss": -0.0082, + "reward": 1.0937500596046448, + "reward_std": 0.26099447533488274, + "rewards/accuracy_reward": 0.14062500186264515, + "rewards/format_reward": 0.9531250447034836, + "step": 3143 + }, + { + "completion_length": 905.3750305175781, + "epoch": 0.9391382271675005, + "grad_norm": 1.5026791095733643, + "kl": 0.89453125, + "learning_rate": 1.100494269300589e-07, + "loss": 0.0785, + "reward": 1.1205357611179352, + "reward_std": 0.22560958936810493, + "rewards/accuracy_reward": 0.16071429196745157, + "rewards/format_reward": 0.9598214626312256, + "step": 3144 + }, + { + "completion_length": 978.2835235595703, + "epoch": 0.939436935255022, + "grad_norm": 0.991927444934845, + "kl": 0.5224609375, + "learning_rate": 1.0995102729666937e-07, + "loss": 0.0272, + "reward": 1.066964328289032, + "reward_std": 0.28662073612213135, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.9553571790456772, + "step": 3145 + }, + { + "completion_length": 806.9777069091797, + "epoch": 0.9397356433425434, + "grad_norm": 3.64662504196167, + "kl": 0.73388671875, + "learning_rate": 1.0985310639343281e-07, + "loss": 0.0676, + "reward": 1.1607143580913544, + "reward_std": 0.15896900556981564, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.9687500596046448, + "step": 3146 + }, + { + "completion_length": 877.5558319091797, + "epoch": 0.940034351430065, + "grad_norm": 1.6249247789382935, + "kl": 0.646484375, + "learning_rate": 1.0975566432687742e-07, + "loss": 0.0519, + "reward": 1.178571492433548, + "reward_std": 0.2452460117638111, + "rewards/accuracy_reward": 0.216517873108387, + "rewards/format_reward": 0.9620536118745804, + "step": 3147 + }, + { + "completion_length": 854.8170013427734, + "epoch": 0.9403330595175864, + "grad_norm": 13.900392532348633, + "kl": 1.2939453125, + "learning_rate": 1.0965870120301068e-07, + "loss": 0.0939, + "reward": 1.0736607611179352, + "reward_std": 0.21959906816482544, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.948660746216774, + "step": 3148 + }, + { + "completion_length": 986.0781860351562, + "epoch": 0.9406317676051079, + "grad_norm": 1.5042099952697754, + "kl": 0.7802734375, + "learning_rate": 1.0956221712731892e-07, + "loss": 0.0468, + "reward": 1.1138393580913544, + "reward_std": 0.26196810975670815, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9620535969734192, + "step": 3149 + }, + { + "completion_length": 838.2411193847656, + "epoch": 0.9409304756926293, + "grad_norm": 1.2453969717025757, + "kl": 0.5888671875, + "learning_rate": 1.0946621220476737e-07, + "loss": 0.0382, + "reward": 1.2142857909202576, + "reward_std": 0.284506157040596, + "rewards/accuracy_reward": 0.2544642947614193, + "rewards/format_reward": 0.9598214775323868, + "step": 3150 + }, + { + "completion_length": 968.9620971679688, + "epoch": 0.9412291837801509, + "grad_norm": 1.953476905822754, + "kl": 0.51513671875, + "learning_rate": 1.0937068653980005e-07, + "loss": -0.0058, + "reward": 1.1183036267757416, + "reward_std": 0.2559252381324768, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.9620535969734192, + "step": 3151 + }, + { + "completion_length": 936.6250610351562, + "epoch": 0.9415278918676723, + "grad_norm": 0.9149961471557617, + "kl": 0.5859375, + "learning_rate": 1.0927564023633935e-07, + "loss": 0.0055, + "reward": 1.1473214775323868, + "reward_std": 0.18955985456705093, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.9687500298023224, + "step": 3152 + }, + { + "completion_length": 937.6741638183594, + "epoch": 0.9418265999551938, + "grad_norm": 0.7031040191650391, + "kl": 0.5166015625, + "learning_rate": 1.0918107339778654e-07, + "loss": 0.0183, + "reward": 1.0379464626312256, + "reward_std": 0.18369808234274387, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.9687500447034836, + "step": 3153 + }, + { + "completion_length": 977.1986999511719, + "epoch": 0.9421253080427152, + "grad_norm": 1.1160272359848022, + "kl": 0.482421875, + "learning_rate": 1.0908698612702097e-07, + "loss": 0.0072, + "reward": 1.0892857611179352, + "reward_std": 0.1611602734774351, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.9754464626312256, + "step": 3154 + }, + { + "completion_length": 1025.4598693847656, + "epoch": 0.9424240161302367, + "grad_norm": 1.0906550884246826, + "kl": 0.5625, + "learning_rate": 1.0899337852640033e-07, + "loss": 0.0035, + "reward": 1.1272321939468384, + "reward_std": 0.29705996811389923, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.948660746216774, + "step": 3155 + }, + { + "completion_length": 922.2455749511719, + "epoch": 0.9427227242177582, + "grad_norm": 0.9251367449760437, + "kl": 0.75390625, + "learning_rate": 1.0890025069776055e-07, + "loss": 0.0335, + "reward": 1.131696492433548, + "reward_std": 0.2916071079671383, + "rewards/accuracy_reward": 0.17633929289877415, + "rewards/format_reward": 0.9553571939468384, + "step": 3156 + }, + { + "completion_length": 856.8817291259766, + "epoch": 0.9430214323052797, + "grad_norm": 1.131180763244629, + "kl": 0.56103515625, + "learning_rate": 1.0880760274241567e-07, + "loss": 0.0398, + "reward": 1.160714328289032, + "reward_std": 0.23533843830227852, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.9598214775323868, + "step": 3157 + }, + { + "completion_length": 927.1250305175781, + "epoch": 0.9433201403928011, + "grad_norm": 1.2608259916305542, + "kl": 0.5478515625, + "learning_rate": 1.0871543476115742e-07, + "loss": 0.0149, + "reward": 1.0758929252624512, + "reward_std": 0.23972734063863754, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.9575893133878708, + "step": 3158 + }, + { + "completion_length": 873.966552734375, + "epoch": 0.9436188484803226, + "grad_norm": 1.3312281370162964, + "kl": 0.564453125, + "learning_rate": 1.0862374685425562e-07, + "loss": 0.0289, + "reward": 1.2053571939468384, + "reward_std": 0.23258964717388153, + "rewards/accuracy_reward": 0.23883929662406445, + "rewards/format_reward": 0.9665178805589676, + "step": 3159 + }, + { + "completion_length": 900.2857513427734, + "epoch": 0.943917556567844, + "grad_norm": 0.9908711314201355, + "kl": 0.470458984375, + "learning_rate": 1.0853253912145777e-07, + "loss": 0.0247, + "reward": 1.113839328289032, + "reward_std": 0.1561651909723878, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.9687500298023224, + "step": 3160 + }, + { + "completion_length": 870.0067291259766, + "epoch": 0.9442162646553656, + "grad_norm": 4.41726541519165, + "kl": 0.6923828125, + "learning_rate": 1.0844181166198886e-07, + "loss": 0.0422, + "reward": 1.1540178656578064, + "reward_std": 0.2451336272060871, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.973214328289032, + "step": 3161 + }, + { + "completion_length": 968.5178985595703, + "epoch": 0.944514972742887, + "grad_norm": 1.3874417543411255, + "kl": 0.64306640625, + "learning_rate": 1.0835156457455151e-07, + "loss": 0.0386, + "reward": 1.1160714626312256, + "reward_std": 0.30423733964562416, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.9464286118745804, + "step": 3162 + }, + { + "completion_length": 973.1585388183594, + "epoch": 0.9448136808304085, + "grad_norm": 1.5797300338745117, + "kl": 0.69677734375, + "learning_rate": 1.0826179795732574e-07, + "loss": 0.0334, + "reward": 1.0870536118745804, + "reward_std": 0.19932296872138977, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.9620536267757416, + "step": 3163 + }, + { + "completion_length": 836.2388916015625, + "epoch": 0.9451123889179299, + "grad_norm": 1.1458815336227417, + "kl": 0.8271484375, + "learning_rate": 1.0817251190796875e-07, + "loss": 0.066, + "reward": 1.131696492433548, + "reward_std": 0.2535821311175823, + "rewards/accuracy_reward": 0.16964286286383867, + "rewards/format_reward": 0.9620536118745804, + "step": 3164 + }, + { + "completion_length": 883.0312805175781, + "epoch": 0.9454110970054515, + "grad_norm": 1.3377342224121094, + "kl": 0.74365234375, + "learning_rate": 1.080837065236151e-07, + "loss": -0.0047, + "reward": 1.042410746216774, + "reward_std": 0.25084077194333076, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.9531250447034836, + "step": 3165 + }, + { + "completion_length": 934.7946929931641, + "epoch": 0.9457098050929729, + "grad_norm": 0.7036731243133545, + "kl": 0.92578125, + "learning_rate": 1.0799538190087624e-07, + "loss": 0.0636, + "reward": 1.0468750298023224, + "reward_std": 0.21981923654675484, + "rewards/accuracy_reward": 0.08928572060540318, + "rewards/format_reward": 0.957589328289032, + "step": 3166 + }, + { + "completion_length": 1017.4063110351562, + "epoch": 0.9460085131804944, + "grad_norm": 1.4784128665924072, + "kl": 0.783203125, + "learning_rate": 1.0790753813584083e-07, + "loss": 0.0081, + "reward": 1.0468750149011612, + "reward_std": 0.20800766721367836, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.955357164144516, + "step": 3167 + }, + { + "completion_length": 952.7835388183594, + "epoch": 0.9463072212680158, + "grad_norm": 2.0849149227142334, + "kl": 0.73779296875, + "learning_rate": 1.0782017532407418e-07, + "loss": 0.0174, + "reward": 1.0937500596046448, + "reward_std": 0.23690641671419144, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.9665178954601288, + "step": 3168 + }, + { + "completion_length": 786.4085083007812, + "epoch": 0.9466059293555373, + "grad_norm": 1.2350714206695557, + "kl": 0.63818359375, + "learning_rate": 1.0773329356061848e-07, + "loss": 0.0344, + "reward": 1.2321428954601288, + "reward_std": 0.25918396189808846, + "rewards/accuracy_reward": 0.2589285895228386, + "rewards/format_reward": 0.973214328289032, + "step": 3169 + }, + { + "completion_length": 924.8214569091797, + "epoch": 0.9469046374430587, + "grad_norm": 1.3883312940597534, + "kl": 0.7685546875, + "learning_rate": 1.0764689293999263e-07, + "loss": 0.0108, + "reward": 1.0111607760190964, + "reward_std": 0.18761274218559265, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.964285746216774, + "step": 3170 + }, + { + "completion_length": 949.3170013427734, + "epoch": 0.9472033455305803, + "grad_norm": 0.9474197030067444, + "kl": 0.64892578125, + "learning_rate": 1.0756097355619198e-07, + "loss": 0.0181, + "reward": 1.0758928954601288, + "reward_std": 0.25538910180330276, + "rewards/accuracy_reward": 0.12500000838190317, + "rewards/format_reward": 0.9508928954601288, + "step": 3171 + }, + { + "completion_length": 881.8928985595703, + "epoch": 0.9475020536181017, + "grad_norm": 0.730702817440033, + "kl": 0.6064453125, + "learning_rate": 1.074755355026884e-07, + "loss": 0.029, + "reward": 1.120535746216774, + "reward_std": 0.16740718111395836, + "rewards/accuracy_reward": 0.1517857222352177, + "rewards/format_reward": 0.9687500447034836, + "step": 3172 + }, + { + "completion_length": 765.5245819091797, + "epoch": 0.9478007617056232, + "grad_norm": 1.4639389514923096, + "kl": 0.791015625, + "learning_rate": 1.0739057887243013e-07, + "loss": 0.0661, + "reward": 1.176339328289032, + "reward_std": 0.2866236977279186, + "rewards/accuracy_reward": 0.2276785857975483, + "rewards/format_reward": 0.948660746216774, + "step": 3173 + }, + { + "completion_length": 906.2187957763672, + "epoch": 0.9480994697931446, + "grad_norm": 1.3492239713668823, + "kl": 0.6083984375, + "learning_rate": 1.0730610375784167e-07, + "loss": -0.0007, + "reward": 1.147321492433548, + "reward_std": 0.26337410137057304, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.95089291036129, + "step": 3174 + }, + { + "completion_length": 832.1942443847656, + "epoch": 0.9483981778806662, + "grad_norm": 0.9423664212226868, + "kl": 0.64453125, + "learning_rate": 1.0722211025082367e-07, + "loss": 0.0046, + "reward": 1.2544643580913544, + "reward_std": 0.238620325922966, + "rewards/accuracy_reward": 0.2790178656578064, + "rewards/format_reward": 0.9754464477300644, + "step": 3175 + }, + { + "completion_length": 892.1272735595703, + "epoch": 0.9486968859681876, + "grad_norm": 0.9130808115005493, + "kl": 0.69140625, + "learning_rate": 1.0713859844275286e-07, + "loss": 0.0001, + "reward": 1.1026786267757416, + "reward_std": 0.2902056947350502, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.9531250447034836, + "step": 3176 + }, + { + "completion_length": 912.1094207763672, + "epoch": 0.9489955940557091, + "grad_norm": 1.1041046380996704, + "kl": 0.7119140625, + "learning_rate": 1.070555684244818e-07, + "loss": 0.0082, + "reward": 1.1250000596046448, + "reward_std": 0.2762252725660801, + "rewards/accuracy_reward": 0.17410715157166123, + "rewards/format_reward": 0.9508928954601288, + "step": 3177 + }, + { + "completion_length": 861.450927734375, + "epoch": 0.9492943021432305, + "grad_norm": 0.9254154562950134, + "kl": 0.582763671875, + "learning_rate": 1.0697302028633907e-07, + "loss": 0.0018, + "reward": 1.1406250596046448, + "reward_std": 0.24158363789319992, + "rewards/accuracy_reward": 0.17187500465661287, + "rewards/format_reward": 0.9687500596046448, + "step": 3178 + }, + { + "completion_length": 871.2455749511719, + "epoch": 0.949593010230752, + "grad_norm": 0.8070911765098572, + "kl": 0.517578125, + "learning_rate": 1.0689095411812898e-07, + "loss": 0.0229, + "reward": 1.0647321790456772, + "reward_std": 0.13624844700098038, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.9776785969734192, + "step": 3179 + }, + { + "completion_length": 836.5558166503906, + "epoch": 0.9498917183182735, + "grad_norm": 1.4088091850280762, + "kl": 0.734375, + "learning_rate": 1.0680937000913143e-07, + "loss": 0.0219, + "reward": 1.194196492433548, + "reward_std": 0.22314468398690224, + "rewards/accuracy_reward": 0.2254464365541935, + "rewards/format_reward": 0.9687500447034836, + "step": 3180 + }, + { + "completion_length": 911.4710388183594, + "epoch": 0.950190426405795, + "grad_norm": 1.4100896120071411, + "kl": 0.7216796875, + "learning_rate": 1.0672826804810203e-07, + "loss": 0.024, + "reward": 1.1919643580913544, + "reward_std": 0.296310693025589, + "rewards/accuracy_reward": 0.2343750037252903, + "rewards/format_reward": 0.957589328289032, + "step": 3181 + }, + { + "completion_length": 843.7053985595703, + "epoch": 0.9504891344933164, + "grad_norm": 1.783893346786499, + "kl": 0.7275390625, + "learning_rate": 1.0664764832327159e-07, + "loss": 0.0029, + "reward": 1.176339328289032, + "reward_std": 0.22363422065973282, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.9575893133878708, + "step": 3182 + }, + { + "completion_length": 963.5960083007812, + "epoch": 0.9507878425808379, + "grad_norm": 1.2935237884521484, + "kl": 0.658203125, + "learning_rate": 1.0656751092234664e-07, + "loss": 0.02, + "reward": 1.1763393580913544, + "reward_std": 0.22735648043453693, + "rewards/accuracy_reward": 0.21875000931322575, + "rewards/format_reward": 0.957589328289032, + "step": 3183 + }, + { + "completion_length": 849.8326110839844, + "epoch": 0.9510865506683593, + "grad_norm": 1.640148401260376, + "kl": 0.79296875, + "learning_rate": 1.0648785593250875e-07, + "loss": 0.0479, + "reward": 1.1517857760190964, + "reward_std": 0.32586781308054924, + "rewards/accuracy_reward": 0.20089287124574184, + "rewards/format_reward": 0.9508928954601288, + "step": 3184 + }, + { + "completion_length": 955.8103179931641, + "epoch": 0.9513852587558809, + "grad_norm": 1.6356383562088013, + "kl": 0.69287109375, + "learning_rate": 1.0640868344041473e-07, + "loss": 0.0109, + "reward": 1.127232164144516, + "reward_std": 0.3140389509499073, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.9375000447034836, + "step": 3185 + }, + { + "completion_length": 931.4442291259766, + "epoch": 0.9516839668434023, + "grad_norm": 1.442939281463623, + "kl": 0.82568359375, + "learning_rate": 1.0632999353219652e-07, + "loss": 0.0099, + "reward": 1.0714286267757416, + "reward_std": 0.3039420545101166, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.9441964775323868, + "step": 3186 + }, + { + "completion_length": 867.2522735595703, + "epoch": 0.9519826749309237, + "grad_norm": 1.6959435939788818, + "kl": 0.7685546875, + "learning_rate": 1.0625178629346103e-07, + "loss": 0.0519, + "reward": 1.0758928954601288, + "reward_std": 0.23341775313019753, + "rewards/accuracy_reward": 0.11607143748551607, + "rewards/format_reward": 0.9598214626312256, + "step": 3187 + }, + { + "completion_length": 828.7411041259766, + "epoch": 0.9522813830184452, + "grad_norm": 33.023826599121094, + "kl": 1.0849609375, + "learning_rate": 1.0617406180929002e-07, + "loss": 0.108, + "reward": 1.129464328289032, + "reward_std": 0.2862798720598221, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.964285746216774, + "step": 3188 + }, + { + "completion_length": 999.7210235595703, + "epoch": 0.9525800911059666, + "grad_norm": 1.5978436470031738, + "kl": 0.51708984375, + "learning_rate": 1.0609682016424013e-07, + "loss": 0.0153, + "reward": 1.176339328289032, + "reward_std": 0.21903828904032707, + "rewards/accuracy_reward": 0.2209821529686451, + "rewards/format_reward": 0.9553571790456772, + "step": 3189 + }, + { + "completion_length": 874.5379943847656, + "epoch": 0.9528787991934882, + "grad_norm": 1.1120905876159668, + "kl": 0.6533203125, + "learning_rate": 1.0602006144234274e-07, + "loss": 0.0062, + "reward": 1.1584821939468384, + "reward_std": 0.1547179277986288, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.973214328289032, + "step": 3190 + }, + { + "completion_length": 911.3393096923828, + "epoch": 0.9531775072810096, + "grad_norm": 0.7268274426460266, + "kl": 0.55517578125, + "learning_rate": 1.059437857271038e-07, + "loss": 0.016, + "reward": 1.2031250298023224, + "reward_std": 0.21118130162358284, + "rewards/accuracy_reward": 0.22991072200238705, + "rewards/format_reward": 0.9732143431901932, + "step": 3191 + }, + { + "completion_length": 925.3460388183594, + "epoch": 0.9534762153685311, + "grad_norm": 1.3200371265411377, + "kl": 0.703125, + "learning_rate": 1.0586799310150379e-07, + "loss": 0.0198, + "reward": 1.1718750596046448, + "reward_std": 0.2744956985116005, + "rewards/accuracy_reward": 0.2120535783469677, + "rewards/format_reward": 0.9598214626312256, + "step": 3192 + }, + { + "completion_length": 964.3259429931641, + "epoch": 0.9537749234560525, + "grad_norm": 0.8487732410430908, + "kl": 0.60400390625, + "learning_rate": 1.0579268364799769e-07, + "loss": 0.006, + "reward": 1.1406250596046448, + "reward_std": 0.22469691932201385, + "rewards/accuracy_reward": 0.17857143096625805, + "rewards/format_reward": 0.9620536118745804, + "step": 3193 + }, + { + "completion_length": 880.0915679931641, + "epoch": 0.954073631543574, + "grad_norm": 0.8433460593223572, + "kl": 0.6943359375, + "learning_rate": 1.0571785744851472e-07, + "loss": 0.0119, + "reward": 1.145089328289032, + "reward_std": 0.24300581216812134, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9508928805589676, + "step": 3194 + }, + { + "completion_length": 915.5312805175781, + "epoch": 0.9543723396310955, + "grad_norm": 0.8529036045074463, + "kl": 0.6533203125, + "learning_rate": 1.056435145844586e-07, + "loss": 0.0463, + "reward": 1.2589286267757416, + "reward_std": 0.29537585377693176, + "rewards/accuracy_reward": 0.305803582072258, + "rewards/format_reward": 0.9531250447034836, + "step": 3195 + }, + { + "completion_length": 967.4955749511719, + "epoch": 0.954671047718617, + "grad_norm": 0.9748793244361877, + "kl": 0.57666015625, + "learning_rate": 1.0556965513670694e-07, + "loss": 0.0241, + "reward": 1.069196492433548, + "reward_std": 0.19607162103056908, + "rewards/accuracy_reward": 0.09598215110599995, + "rewards/format_reward": 0.973214328289032, + "step": 3196 + }, + { + "completion_length": 910.825927734375, + "epoch": 0.9549697558061384, + "grad_norm": 0.9953001141548157, + "kl": 0.658203125, + "learning_rate": 1.0549627918561161e-07, + "loss": 0.009, + "reward": 1.2008929252624512, + "reward_std": 0.230307936668396, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9709821939468384, + "step": 3197 + }, + { + "completion_length": 987.5045166015625, + "epoch": 0.9552684638936599, + "grad_norm": 0.8947663903236389, + "kl": 0.48486328125, + "learning_rate": 1.0542338681099859e-07, + "loss": 0.0134, + "reward": 1.116071492433548, + "reward_std": 0.20548776909708977, + "rewards/accuracy_reward": 0.1718750037252903, + "rewards/format_reward": 0.9441964775323868, + "step": 3198 + }, + { + "completion_length": 884.1652221679688, + "epoch": 0.9555671719811814, + "grad_norm": 1.3457268476486206, + "kl": 0.7587890625, + "learning_rate": 1.0535097809216743e-07, + "loss": 0.0391, + "reward": 1.1964286267757416, + "reward_std": 0.2427572775632143, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.9598214626312256, + "step": 3199 + }, + { + "completion_length": 1002.5067291259766, + "epoch": 0.9558658800687029, + "grad_norm": 0.9570949673652649, + "kl": 0.6318359375, + "learning_rate": 1.0527905310789185e-07, + "loss": 0.0327, + "reward": 1.0781250596046448, + "reward_std": 0.20061932876706123, + "rewards/accuracy_reward": 0.12276786682195961, + "rewards/format_reward": 0.9553571790456772, + "step": 3200 + }, + { + "completion_length": 1001.1228332519531, + "epoch": 0.9561645881562243, + "grad_norm": 1.7972689867019653, + "kl": 0.7705078125, + "learning_rate": 1.0520761193641912e-07, + "loss": -0.0072, + "reward": 1.0468750596046448, + "reward_std": 0.3026518449187279, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.9285714775323868, + "step": 3201 + }, + { + "completion_length": 927.5803680419922, + "epoch": 0.9564632962437458, + "grad_norm": 0.7166677117347717, + "kl": 0.6796875, + "learning_rate": 1.051366546554703e-07, + "loss": 0.0096, + "reward": 1.116071492433548, + "reward_std": 0.2464628778398037, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9598214775323868, + "step": 3202 + }, + { + "completion_length": 854.6786193847656, + "epoch": 0.9567620043312672, + "grad_norm": 1.3846322298049927, + "kl": 0.4091796875, + "learning_rate": 1.050661813422399e-07, + "loss": 0.0359, + "reward": 1.2611607909202576, + "reward_std": 0.28699592500925064, + "rewards/accuracy_reward": 0.2991071566939354, + "rewards/format_reward": 0.9620536118745804, + "step": 3203 + }, + { + "completion_length": 967.3482666015625, + "epoch": 0.9570607124187888, + "grad_norm": 0.9868643283843994, + "kl": 0.60205078125, + "learning_rate": 1.0499619207339604e-07, + "loss": 0.0406, + "reward": 1.1071428954601288, + "reward_std": 0.21063581109046936, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.973214328289032, + "step": 3204 + }, + { + "completion_length": 982.0848693847656, + "epoch": 0.9573594205063102, + "grad_norm": 1.1002459526062012, + "kl": 0.87890625, + "learning_rate": 1.0492668692508011e-07, + "loss": 0.0457, + "reward": 1.2165179252624512, + "reward_std": 0.3068091832101345, + "rewards/accuracy_reward": 0.2656250037252903, + "rewards/format_reward": 0.9508928805589676, + "step": 3205 + }, + { + "completion_length": 850.4464721679688, + "epoch": 0.9576581285938317, + "grad_norm": 0.9938891530036926, + "kl": 0.48974609375, + "learning_rate": 1.0485766597290697e-07, + "loss": 0.0361, + "reward": 1.0066964775323868, + "reward_std": 0.1781723741441965, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.9642857611179352, + "step": 3206 + }, + { + "completion_length": 970.8616485595703, + "epoch": 0.9579568366813531, + "grad_norm": 0.7640122175216675, + "kl": 0.7626953125, + "learning_rate": 1.0478912929196455e-07, + "loss": 0.0146, + "reward": 1.1026785969734192, + "reward_std": 0.19482001289725304, + "rewards/accuracy_reward": 0.12946429289877415, + "rewards/format_reward": 0.9732143431901932, + "step": 3207 + }, + { + "completion_length": 908.8303833007812, + "epoch": 0.9582555447688746, + "grad_norm": 0.7552157044410706, + "kl": 0.50634765625, + "learning_rate": 1.0472107695681412e-07, + "loss": 0.0212, + "reward": 1.1361607909202576, + "reward_std": 0.20521723851561546, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.9687500298023224, + "step": 3208 + }, + { + "completion_length": 989.3995971679688, + "epoch": 0.9585542528563961, + "grad_norm": 0.9208122491836548, + "kl": 0.6630859375, + "learning_rate": 1.0465350904148996e-07, + "loss": 0.0225, + "reward": 1.2008929252624512, + "reward_std": 0.21948442608118057, + "rewards/accuracy_reward": 0.23214287031441927, + "rewards/format_reward": 0.9687500298023224, + "step": 3209 + }, + { + "completion_length": 903.9040374755859, + "epoch": 0.9588529609439176, + "grad_norm": 0.9856664538383484, + "kl": 0.4990234375, + "learning_rate": 1.0458642561949932e-07, + "loss": 0.0253, + "reward": 1.1205357611179352, + "reward_std": 0.16991296783089638, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9598214626312256, + "step": 3210 + }, + { + "completion_length": 998.6495819091797, + "epoch": 0.959151669031439, + "grad_norm": 1.7609902620315552, + "kl": 0.423828125, + "learning_rate": 1.0451982676382239e-07, + "loss": 0.0064, + "reward": 1.1272321939468384, + "reward_std": 0.25192130729556084, + "rewards/accuracy_reward": 0.16517858393490314, + "rewards/format_reward": 0.9620535969734192, + "step": 3211 + }, + { + "completion_length": 991.7076416015625, + "epoch": 0.9594503771189605, + "grad_norm": 1.0347191095352173, + "kl": 0.6201171875, + "learning_rate": 1.0445371254691217e-07, + "loss": 0.0734, + "reward": 1.1651786267757416, + "reward_std": 0.28556786850094795, + "rewards/accuracy_reward": 0.20312500558793545, + "rewards/format_reward": 0.9620536118745804, + "step": 3212 + }, + { + "completion_length": 980.2388763427734, + "epoch": 0.9597490852064819, + "grad_norm": 1.0548213720321655, + "kl": 0.54931640625, + "learning_rate": 1.0438808304069443e-07, + "loss": -0.0176, + "reward": 1.1361607611179352, + "reward_std": 0.25112679600715637, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.948660746216774, + "step": 3213 + }, + { + "completion_length": 965.6094055175781, + "epoch": 0.9600477932940035, + "grad_norm": 0.9224072694778442, + "kl": 0.59130859375, + "learning_rate": 1.0432293831656773e-07, + "loss": 0.0368, + "reward": 1.0691964775323868, + "reward_std": 0.1684934850782156, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.9620535969734192, + "step": 3214 + }, + { + "completion_length": 845.5669860839844, + "epoch": 0.9603465013815249, + "grad_norm": 2.2330963611602783, + "kl": 0.57763671875, + "learning_rate": 1.0425827844540311e-07, + "loss": -0.0258, + "reward": 1.1160714626312256, + "reward_std": 0.20229348354041576, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.9620535969734192, + "step": 3215 + }, + { + "completion_length": 936.4598541259766, + "epoch": 0.9606452094690464, + "grad_norm": 1.2658644914627075, + "kl": 0.6220703125, + "learning_rate": 1.0419410349754414e-07, + "loss": 0.0329, + "reward": 1.1071429252624512, + "reward_std": 0.19217334315180779, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.973214328289032, + "step": 3216 + }, + { + "completion_length": 896.1205596923828, + "epoch": 0.9609439175565678, + "grad_norm": 1.676344871520996, + "kl": 0.88671875, + "learning_rate": 1.0413041354280689e-07, + "loss": 0.0334, + "reward": 1.0245536267757416, + "reward_std": 0.22858993895351887, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.941964328289032, + "step": 3217 + }, + { + "completion_length": 884.1987152099609, + "epoch": 0.9612426256440894, + "grad_norm": 1.3283603191375732, + "kl": 0.52197265625, + "learning_rate": 1.0406720865047981e-07, + "loss": 0.0422, + "reward": 1.162946492433548, + "reward_std": 0.26333946734666824, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9620535969734192, + "step": 3218 + }, + { + "completion_length": 919.8906555175781, + "epoch": 0.9615413337316108, + "grad_norm": 0.8937078714370728, + "kl": 0.6083984375, + "learning_rate": 1.0400448888932357e-07, + "loss": -0.0019, + "reward": 1.0267857909202576, + "reward_std": 0.22215037420392036, + "rewards/accuracy_reward": 0.08258928777649999, + "rewards/format_reward": 0.9441964626312256, + "step": 3219 + }, + { + "completion_length": 900.8638916015625, + "epoch": 0.9618400418191323, + "grad_norm": 0.4610044062137604, + "kl": 0.2861328125, + "learning_rate": 1.039422543275712e-07, + "loss": -0.0004, + "reward": 1.1562500298023224, + "reward_std": 0.15286973118782043, + "rewards/accuracy_reward": 0.17410715389996767, + "rewards/format_reward": 0.9821428954601288, + "step": 3220 + }, + { + "completion_length": 958.8951263427734, + "epoch": 0.9621387499066537, + "grad_norm": 0.8687283992767334, + "kl": 0.732421875, + "learning_rate": 1.0388050503292772e-07, + "loss": 0.0445, + "reward": 1.1205357611179352, + "reward_std": 0.22996852174401283, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.9620536118745804, + "step": 3221 + }, + { + "completion_length": 867.9018096923828, + "epoch": 0.9624374579941752, + "grad_norm": 1.6048332452774048, + "kl": 0.625, + "learning_rate": 1.0381924107257034e-07, + "loss": 0.0461, + "reward": 1.1495536267757416, + "reward_std": 0.2004067823290825, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.9553571790456772, + "step": 3222 + }, + { + "completion_length": 925.5335083007812, + "epoch": 0.9627361660816967, + "grad_norm": 0.7290322780609131, + "kl": 0.7822265625, + "learning_rate": 1.0375846251314833e-07, + "loss": 0.0464, + "reward": 1.06026791036129, + "reward_std": 0.24060756340622902, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.95089291036129, + "step": 3223 + }, + { + "completion_length": 861.3884124755859, + "epoch": 0.9630348741692182, + "grad_norm": 1.179878830909729, + "kl": 0.62158203125, + "learning_rate": 1.036981694207827e-07, + "loss": -0.009, + "reward": 1.1383929252624512, + "reward_std": 0.2294715065509081, + "rewards/accuracy_reward": 0.1852678619325161, + "rewards/format_reward": 0.9531250596046448, + "step": 3224 + }, + { + "completion_length": 888.0580749511719, + "epoch": 0.9633335822567396, + "grad_norm": 1.0077390670776367, + "kl": 0.447265625, + "learning_rate": 1.0363836186106642e-07, + "loss": 0.0198, + "reward": 1.2366071939468384, + "reward_std": 0.24424373358488083, + "rewards/accuracy_reward": 0.2589285932481289, + "rewards/format_reward": 0.9776786118745804, + "step": 3225 + }, + { + "completion_length": 992.0201416015625, + "epoch": 0.9636322903442611, + "grad_norm": 0.9505878686904907, + "kl": 0.6533203125, + "learning_rate": 1.0357903989906437e-07, + "loss": 0.0488, + "reward": 1.100446492433548, + "reward_std": 0.1596509488299489, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.9665178805589676, + "step": 3226 + }, + { + "completion_length": 942.7634582519531, + "epoch": 0.9639309984317825, + "grad_norm": 0.8254166841506958, + "kl": 0.74853515625, + "learning_rate": 1.0352020359931289e-07, + "loss": 0.0268, + "reward": 1.160714328289032, + "reward_std": 0.20536521449685097, + "rewards/accuracy_reward": 0.1897321455180645, + "rewards/format_reward": 0.9709821790456772, + "step": 3227 + }, + { + "completion_length": 912.1027374267578, + "epoch": 0.9642297065193041, + "grad_norm": 1.1008106470108032, + "kl": 0.69091796875, + "learning_rate": 1.0346185302582017e-07, + "loss": 0.0368, + "reward": 1.1696428954601288, + "reward_std": 0.2899806834757328, + "rewards/accuracy_reward": 0.2209821492433548, + "rewards/format_reward": 0.9486607611179352, + "step": 3228 + }, + { + "completion_length": 903.4018402099609, + "epoch": 0.9645284146068255, + "grad_norm": 1.4733467102050781, + "kl": 0.5439453125, + "learning_rate": 1.0340398824206595e-07, + "loss": 0.0533, + "reward": 1.1651786267757416, + "reward_std": 0.24255100265145302, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9665178954601288, + "step": 3229 + }, + { + "completion_length": 922.7210388183594, + "epoch": 0.9648271226943469, + "grad_norm": 1.2301266193389893, + "kl": 0.49853515625, + "learning_rate": 1.033466093110014e-07, + "loss": 0.0347, + "reward": 1.1651786267757416, + "reward_std": 0.281108807772398, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9620536267757416, + "step": 3230 + }, + { + "completion_length": 890.6272735595703, + "epoch": 0.9651258307818684, + "grad_norm": 0.6961304545402527, + "kl": 0.65234375, + "learning_rate": 1.0328971629504919e-07, + "loss": 0.0329, + "reward": 1.1071429252624512, + "reward_std": 0.19156829454004765, + "rewards/accuracy_reward": 0.14508929196745157, + "rewards/format_reward": 0.9620536267757416, + "step": 3231 + }, + { + "completion_length": 935.4062957763672, + "epoch": 0.9654245388693898, + "grad_norm": 0.8352493047714233, + "kl": 0.3671875, + "learning_rate": 1.0323330925610333e-07, + "loss": -0.0153, + "reward": 1.0267857760190964, + "reward_std": 0.17367032170295715, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.9687500447034836, + "step": 3232 + }, + { + "completion_length": 885.9598693847656, + "epoch": 0.9657232469569114, + "grad_norm": 0.7703906297683716, + "kl": 0.502197265625, + "learning_rate": 1.0317738825552916e-07, + "loss": 0.0254, + "reward": 1.1116072237491608, + "reward_std": 0.22743751481175423, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.9732143431901932, + "step": 3233 + }, + { + "completion_length": 918.1830749511719, + "epoch": 0.9660219550444328, + "grad_norm": 0.8993591070175171, + "kl": 0.4931640625, + "learning_rate": 1.0312195335416322e-07, + "loss": 0.0529, + "reward": 1.2031250298023224, + "reward_std": 0.2384892962872982, + "rewards/accuracy_reward": 0.2276785783469677, + "rewards/format_reward": 0.9754464775323868, + "step": 3234 + }, + { + "completion_length": 905.7477874755859, + "epoch": 0.9663206631319543, + "grad_norm": 7.540886402130127, + "kl": 0.63818359375, + "learning_rate": 1.030670046123133e-07, + "loss": 0.052, + "reward": 1.1540178954601288, + "reward_std": 0.2585162818431854, + "rewards/accuracy_reward": 0.2008928619325161, + "rewards/format_reward": 0.9531250298023224, + "step": 3235 + }, + { + "completion_length": 903.2411041259766, + "epoch": 0.9666193712194757, + "grad_norm": 0.8812289834022522, + "kl": 0.37744140625, + "learning_rate": 1.0301254208975823e-07, + "loss": 0.0018, + "reward": 1.2366072237491608, + "reward_std": 0.2105230949819088, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.9866071790456772, + "step": 3236 + }, + { + "completion_length": 823.5446624755859, + "epoch": 0.9669180793069972, + "grad_norm": 2.377842903137207, + "kl": 0.556640625, + "learning_rate": 1.0295856584574785e-07, + "loss": 0.0295, + "reward": 1.160714328289032, + "reward_std": 0.2592652775347233, + "rewards/accuracy_reward": 0.20982144121080637, + "rewards/format_reward": 0.9508928954601288, + "step": 3237 + }, + { + "completion_length": 900.2723693847656, + "epoch": 0.9672167873945187, + "grad_norm": 0.7887007594108582, + "kl": 0.50927734375, + "learning_rate": 1.0290507593900307e-07, + "loss": 0.0306, + "reward": 1.1116071939468384, + "reward_std": 0.19612165354192257, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.9709821939468384, + "step": 3238 + }, + { + "completion_length": 896.8638763427734, + "epoch": 0.9675154954820402, + "grad_norm": 1.0533583164215088, + "kl": 0.5244140625, + "learning_rate": 1.0285207242771568e-07, + "loss": 0.0236, + "reward": 1.1540178954601288, + "reward_std": 0.22659431397914886, + "rewards/accuracy_reward": 0.18303572107106447, + "rewards/format_reward": 0.9709821790456772, + "step": 3239 + }, + { + "completion_length": 839.3281555175781, + "epoch": 0.9678142035695616, + "grad_norm": 0.954180896282196, + "kl": 0.40185546875, + "learning_rate": 1.027995553695483e-07, + "loss": 0.0184, + "reward": 1.0602678954601288, + "reward_std": 0.2317110188305378, + "rewards/accuracy_reward": 0.09375000279396772, + "rewards/format_reward": 0.9665178954601288, + "step": 3240 + }, + { + "completion_length": 937.0982513427734, + "epoch": 0.9681129116570831, + "grad_norm": 1.0583133697509766, + "kl": 0.4921875, + "learning_rate": 1.0274752482163426e-07, + "loss": 0.0038, + "reward": 1.1875000596046448, + "reward_std": 0.21516264975070953, + "rewards/accuracy_reward": 0.20758928917348385, + "rewards/format_reward": 0.979910746216774, + "step": 3241 + }, + { + "completion_length": 999.2121124267578, + "epoch": 0.9684116197446045, + "grad_norm": 1.1899257898330688, + "kl": 0.56298828125, + "learning_rate": 1.0269598084057783e-07, + "loss": 0.0306, + "reward": 1.1093750596046448, + "reward_std": 0.18567607179284096, + "rewards/accuracy_reward": 0.1450892873108387, + "rewards/format_reward": 0.9642857611179352, + "step": 3242 + }, + { + "completion_length": 944.1942443847656, + "epoch": 0.9687103278321261, + "grad_norm": 1.2918986082077026, + "kl": 0.55712890625, + "learning_rate": 1.0264492348245369e-07, + "loss": 0.0044, + "reward": 1.147321492433548, + "reward_std": 0.2123798504471779, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9709821939468384, + "step": 3243 + }, + { + "completion_length": 1015.5245819091797, + "epoch": 0.9690090359196475, + "grad_norm": 0.7551630735397339, + "kl": 0.51171875, + "learning_rate": 1.0259435280280732e-07, + "loss": 0.0122, + "reward": 1.1517857611179352, + "reward_std": 0.1645408496260643, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.9776786118745804, + "step": 3244 + }, + { + "completion_length": 986.6004943847656, + "epoch": 0.969307744007169, + "grad_norm": 1.1059401035308838, + "kl": 0.52685546875, + "learning_rate": 1.0254426885665462e-07, + "loss": -0.012, + "reward": 1.0781250149011612, + "reward_std": 0.24247264862060547, + "rewards/accuracy_reward": 0.12053571920841932, + "rewards/format_reward": 0.957589328289032, + "step": 3245 + }, + { + "completion_length": 934.2388763427734, + "epoch": 0.9696064520946904, + "grad_norm": 1.2273973226547241, + "kl": 0.51220703125, + "learning_rate": 1.0249467169848205e-07, + "loss": 0.0181, + "reward": 1.0915178954601288, + "reward_std": 0.22234264016151428, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 3246 + }, + { + "completion_length": 944.779052734375, + "epoch": 0.969905160182212, + "grad_norm": 1.05387544631958, + "kl": 0.4091796875, + "learning_rate": 1.0244556138224637e-07, + "loss": 0.004, + "reward": 1.098214328289032, + "reward_std": 0.23668359965085983, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.9464286118745804, + "step": 3247 + }, + { + "completion_length": 929.654052734375, + "epoch": 0.9702038682697334, + "grad_norm": 2.6562111377716064, + "kl": 0.57958984375, + "learning_rate": 1.0239693796137493e-07, + "loss": 0.0394, + "reward": 1.0758928954601288, + "reward_std": 0.2306467741727829, + "rewards/accuracy_reward": 0.12500000977888703, + "rewards/format_reward": 0.9508928954601288, + "step": 3248 + }, + { + "completion_length": 876.2879791259766, + "epoch": 0.9705025763572549, + "grad_norm": 0.9892987012863159, + "kl": 0.61865234375, + "learning_rate": 1.0234880148876515e-07, + "loss": 0.0056, + "reward": 1.069196492433548, + "reward_std": 0.19677217490971088, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.973214328289032, + "step": 3249 + }, + { + "completion_length": 914.5826110839844, + "epoch": 0.9708012844447763, + "grad_norm": 1.0476782321929932, + "kl": 0.62939453125, + "learning_rate": 1.023011520167848e-07, + "loss": 0.0244, + "reward": 1.0714286118745804, + "reward_std": 0.2955422103404999, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.9531250447034836, + "step": 3250 + }, + { + "completion_length": 927.1540679931641, + "epoch": 0.9710999925322978, + "grad_norm": 1.3065589666366577, + "kl": 0.3974609375, + "learning_rate": 1.0225398959727186e-07, + "loss": 0.0222, + "reward": 1.1562500298023224, + "reward_std": 0.21957174688577652, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9575893431901932, + "step": 3251 + }, + { + "completion_length": 826.3951263427734, + "epoch": 0.9713987006198193, + "grad_norm": 1.4580141305923462, + "kl": 0.63623046875, + "learning_rate": 1.0220731428153443e-07, + "loss": 0.0447, + "reward": 1.095982164144516, + "reward_std": 0.2370159775018692, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.9441964477300644, + "step": 3252 + }, + { + "completion_length": 859.9576416015625, + "epoch": 0.9716974087073408, + "grad_norm": 1.734487533569336, + "kl": 0.4912109375, + "learning_rate": 1.0216112612035063e-07, + "loss": 0.034, + "reward": 1.2366072237491608, + "reward_std": 0.3050815314054489, + "rewards/accuracy_reward": 0.279017873108387, + "rewards/format_reward": 0.957589328289032, + "step": 3253 + }, + { + "completion_length": 833.7053833007812, + "epoch": 0.9719961167948622, + "grad_norm": 1.565192699432373, + "kl": 0.5830078125, + "learning_rate": 1.0211542516396875e-07, + "loss": 0.0349, + "reward": 1.1718750447034836, + "reward_std": 0.23665438406169415, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.9665178954601288, + "step": 3254 + }, + { + "completion_length": 903.9397583007812, + "epoch": 0.9722948248823837, + "grad_norm": 2.3882362842559814, + "kl": 0.64990234375, + "learning_rate": 1.020702114621068e-07, + "loss": 0.0005, + "reward": 1.145089328289032, + "reward_std": 0.28534556180238724, + "rewards/accuracy_reward": 0.1897321455180645, + "rewards/format_reward": 0.9553571790456772, + "step": 3255 + }, + { + "completion_length": 1004.2143096923828, + "epoch": 0.9725935329699051, + "grad_norm": 1.543635606765747, + "kl": 0.90625, + "learning_rate": 1.0202548506395297e-07, + "loss": 0.0367, + "reward": 1.0714286267757416, + "reward_std": 0.26967835798859596, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.9397321790456772, + "step": 3256 + }, + { + "completion_length": 903.9665679931641, + "epoch": 0.9728922410574267, + "grad_norm": 1.1968238353729248, + "kl": 0.595703125, + "learning_rate": 1.0198124601816523e-07, + "loss": 0.0422, + "reward": 1.1428571939468384, + "reward_std": 0.2613285444676876, + "rewards/accuracy_reward": 0.18080358393490314, + "rewards/format_reward": 0.9620536118745804, + "step": 3257 + }, + { + "completion_length": 936.9486846923828, + "epoch": 0.9731909491449481, + "grad_norm": 1.2861242294311523, + "kl": 0.76171875, + "learning_rate": 1.019374943728712e-07, + "loss": 0.0235, + "reward": 1.0870536416769028, + "reward_std": 0.2163458615541458, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.9620536118745804, + "step": 3258 + }, + { + "completion_length": 895.169677734375, + "epoch": 0.9734896572324696, + "grad_norm": 1.503171682357788, + "kl": 0.64501953125, + "learning_rate": 1.0189423017566845e-07, + "loss": 0.0126, + "reward": 1.0647321939468384, + "reward_std": 0.1781274378299713, + "rewards/accuracy_reward": 0.09375000651925802, + "rewards/format_reward": 0.9709821939468384, + "step": 3259 + }, + { + "completion_length": 865.0870971679688, + "epoch": 0.973788365319991, + "grad_norm": 1.2285012006759644, + "kl": 0.76904296875, + "learning_rate": 1.0185145347362418e-07, + "loss": -0.01, + "reward": 1.1674107611179352, + "reward_std": 0.24123255535960197, + "rewards/accuracy_reward": 0.21205358393490314, + "rewards/format_reward": 0.9553571790456772, + "step": 3260 + }, + { + "completion_length": 938.7165679931641, + "epoch": 0.9740870734075125, + "grad_norm": 1.7494008541107178, + "kl": 0.8681640625, + "learning_rate": 1.018091643132753e-07, + "loss": 0.0224, + "reward": 1.1406250596046448, + "reward_std": 0.20874984189867973, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.9687500298023224, + "step": 3261 + }, + { + "completion_length": 894.0736999511719, + "epoch": 0.974385781495034, + "grad_norm": 1.889276146888733, + "kl": 0.865234375, + "learning_rate": 1.0176736274062817e-07, + "loss": 0.0161, + "reward": 1.1272321939468384, + "reward_std": 0.27518952265381813, + "rewards/accuracy_reward": 0.17857143469154835, + "rewards/format_reward": 0.948660746216774, + "step": 3262 + }, + { + "completion_length": 842.9263610839844, + "epoch": 0.9746844895825555, + "grad_norm": 1.5008015632629395, + "kl": 1.111328125, + "learning_rate": 1.0172604880115888e-07, + "loss": 0.0306, + "reward": 1.084821492433548, + "reward_std": 0.2721674479544163, + "rewards/accuracy_reward": 0.13169643934816122, + "rewards/format_reward": 0.9531250447034836, + "step": 3263 + }, + { + "completion_length": 880.6808471679688, + "epoch": 0.9749831976700769, + "grad_norm": 1.040971040725708, + "kl": 0.794921875, + "learning_rate": 1.0168522253981293e-07, + "loss": 0.0876, + "reward": 1.1406250298023224, + "reward_std": 0.2201441526412964, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.9642857760190964, + "step": 3264 + }, + { + "completion_length": 832.9598693847656, + "epoch": 0.9752819057575984, + "grad_norm": 1.4013088941574097, + "kl": 0.703125, + "learning_rate": 1.0164488400100528e-07, + "loss": 0.017, + "reward": 1.1629465222358704, + "reward_std": 0.2019926719367504, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.9642857760190964, + "step": 3265 + }, + { + "completion_length": 916.2902221679688, + "epoch": 0.9755806138451198, + "grad_norm": 2.2222375869750977, + "kl": 0.8232421875, + "learning_rate": 1.0160503322862032e-07, + "loss": 0.0093, + "reward": 1.116071492433548, + "reward_std": 0.21179485507309437, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.9665178954601288, + "step": 3266 + }, + { + "completion_length": 869.5201263427734, + "epoch": 0.9758793219326414, + "grad_norm": 0.8466964364051819, + "kl": 0.55224609375, + "learning_rate": 1.0156567026601176e-07, + "loss": 0.0257, + "reward": 1.147321492433548, + "reward_std": 0.22428465262055397, + "rewards/accuracy_reward": 0.18750000186264515, + "rewards/format_reward": 0.9598214775323868, + "step": 3267 + }, + { + "completion_length": 869.6942291259766, + "epoch": 0.9761780300201628, + "grad_norm": 1.4995827674865723, + "kl": 0.6845703125, + "learning_rate": 1.015267951560027e-07, + "loss": 0.0446, + "reward": 1.0156250298023224, + "reward_std": 0.22631410136818886, + "rewards/accuracy_reward": 0.06696428754366934, + "rewards/format_reward": 0.948660746216774, + "step": 3268 + }, + { + "completion_length": 902.2076263427734, + "epoch": 0.9764767381076843, + "grad_norm": 1.4483710527420044, + "kl": 0.603515625, + "learning_rate": 1.0148840794088538e-07, + "loss": 0.0167, + "reward": 1.2031250298023224, + "reward_std": 0.23960281535983086, + "rewards/accuracy_reward": 0.23437500977888703, + "rewards/format_reward": 0.9687500298023224, + "step": 3269 + }, + { + "completion_length": 889.0960235595703, + "epoch": 0.9767754461952057, + "grad_norm": 1.0519722700119019, + "kl": 0.7236328125, + "learning_rate": 1.0145050866242139e-07, + "loss": 0.0468, + "reward": 1.1517857909202576, + "reward_std": 0.20608560368418694, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.9553571790456772, + "step": 3270 + }, + { + "completion_length": 924.4911193847656, + "epoch": 0.9770741542827273, + "grad_norm": 1.0309644937515259, + "kl": 0.7353515625, + "learning_rate": 1.0141309736184135e-07, + "loss": 0.0423, + "reward": 1.1495536267757416, + "reward_std": 0.2282738760113716, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.9508928954601288, + "step": 3271 + }, + { + "completion_length": 910.7545013427734, + "epoch": 0.9773728623702487, + "grad_norm": 1.2426059246063232, + "kl": 0.64599609375, + "learning_rate": 1.0137617407984517e-07, + "loss": 0.0478, + "reward": 1.1049107611179352, + "reward_std": 0.24058742448687553, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.9531250447034836, + "step": 3272 + }, + { + "completion_length": 915.4219207763672, + "epoch": 0.9776715704577701, + "grad_norm": 0.9347116947174072, + "kl": 0.64501953125, + "learning_rate": 1.0133973885660173e-07, + "loss": 0.0098, + "reward": 1.1049107611179352, + "reward_std": 0.22264081612229347, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.9709821790456772, + "step": 3273 + }, + { + "completion_length": 911.4486999511719, + "epoch": 0.9779702785452916, + "grad_norm": 1.420290470123291, + "kl": 0.578125, + "learning_rate": 1.0130379173174901e-07, + "loss": -0.0085, + "reward": 1.1651785969734192, + "reward_std": 0.28390755876898766, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.9620536118745804, + "step": 3274 + }, + { + "completion_length": 875.1094207763672, + "epoch": 0.978268986632813, + "grad_norm": 2.1972293853759766, + "kl": 0.630859375, + "learning_rate": 1.0126833274439385e-07, + "loss": -0.001, + "reward": 1.1093750447034836, + "reward_std": 0.3272711858153343, + "rewards/accuracy_reward": 0.18750001303851604, + "rewards/format_reward": 0.9218750447034836, + "step": 3275 + }, + { + "completion_length": 984.1942291259766, + "epoch": 0.9785676947203346, + "grad_norm": 0.8984198570251465, + "kl": 0.7080078125, + "learning_rate": 1.0123336193311232e-07, + "loss": -0.0109, + "reward": 1.0848215222358704, + "reward_std": 0.22396796941757202, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.9598214626312256, + "step": 3276 + }, + { + "completion_length": 924.9844207763672, + "epoch": 0.978866402807856, + "grad_norm": 1.363180160522461, + "kl": 0.90234375, + "learning_rate": 1.0119887933594911e-07, + "loss": -0.0004, + "reward": 1.1875000298023224, + "reward_std": 0.2951166331768036, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9486607611179352, + "step": 3277 + }, + { + "completion_length": 973.9799499511719, + "epoch": 0.9791651108953775, + "grad_norm": 1.0389126539230347, + "kl": 0.603515625, + "learning_rate": 1.0116488499041794e-07, + "loss": 0.0031, + "reward": 1.1651786118745804, + "reward_std": 0.2821730524301529, + "rewards/accuracy_reward": 0.20758929662406445, + "rewards/format_reward": 0.957589328289032, + "step": 3278 + }, + { + "completion_length": 929.7545166015625, + "epoch": 0.9794638189828989, + "grad_norm": 1.2162200212478638, + "kl": 0.6005859375, + "learning_rate": 1.0113137893350135e-07, + "loss": 0.0469, + "reward": 1.0959821790456772, + "reward_std": 0.2331821359694004, + "rewards/accuracy_reward": 0.14062500838190317, + "rewards/format_reward": 0.9553571939468384, + "step": 3279 + }, + { + "completion_length": 917.8549499511719, + "epoch": 0.9797625270704204, + "grad_norm": 1.6201364994049072, + "kl": 0.708984375, + "learning_rate": 1.0109836120165059e-07, + "loss": 0.0402, + "reward": 1.087053582072258, + "reward_std": 0.22542335093021393, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.9553571790456772, + "step": 3280 + }, + { + "completion_length": 870.9062805175781, + "epoch": 0.9800612351579419, + "grad_norm": 1.1589266061782837, + "kl": 0.900390625, + "learning_rate": 1.0106583183078579e-07, + "loss": 0.0268, + "reward": 1.0491072237491608, + "reward_std": 0.24043823778629303, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.9598214775323868, + "step": 3281 + }, + { + "completion_length": 820.200927734375, + "epoch": 0.9803599432454634, + "grad_norm": 1.8999704122543335, + "kl": 0.6630859375, + "learning_rate": 1.0103379085629569e-07, + "loss": 0.0167, + "reward": 1.098214328289032, + "reward_std": 0.18654434382915497, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.970982164144516, + "step": 3282 + }, + { + "completion_length": 863.4888763427734, + "epoch": 0.9806586513329848, + "grad_norm": 1.4687258005142212, + "kl": 0.765625, + "learning_rate": 1.0100223831303767e-07, + "loss": 0.0321, + "reward": 1.1718750298023224, + "reward_std": 0.2081343401223421, + "rewards/accuracy_reward": 0.2165178693830967, + "rewards/format_reward": 0.9553571790456772, + "step": 3283 + }, + { + "completion_length": 835.0826263427734, + "epoch": 0.9809573594205063, + "grad_norm": 1.4355425834655762, + "kl": 1.0458984375, + "learning_rate": 1.0097117423533792e-07, + "loss": 0.006, + "reward": 1.0401786416769028, + "reward_std": 0.2726965695619583, + "rewards/accuracy_reward": 0.08928571688011289, + "rewards/format_reward": 0.9508928954601288, + "step": 3284 + }, + { + "completion_length": 898.982177734375, + "epoch": 0.9812560675080277, + "grad_norm": 2.007012128829956, + "kl": 0.908203125, + "learning_rate": 1.00940598656991e-07, + "loss": 0.0492, + "reward": 1.1138393431901932, + "reward_std": 0.26783934235572815, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9508928954601288, + "step": 3285 + }, + { + "completion_length": 923.4420013427734, + "epoch": 0.9815547755955493, + "grad_norm": 1.262165904045105, + "kl": 0.908203125, + "learning_rate": 1.0091051161126022e-07, + "loss": 0.0368, + "reward": 1.098214328289032, + "reward_std": 0.2608444169163704, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.957589328289032, + "step": 3286 + }, + { + "completion_length": 876.8839721679688, + "epoch": 0.9818534836830707, + "grad_norm": 0.8919641375541687, + "kl": 0.681640625, + "learning_rate": 1.0088091313087727e-07, + "loss": 0.0093, + "reward": 1.1183035969734192, + "reward_std": 0.21731337904930115, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.9575893133878708, + "step": 3287 + }, + { + "completion_length": 913.7924346923828, + "epoch": 0.9821521917705922, + "grad_norm": 1.1792478561401367, + "kl": 0.69921875, + "learning_rate": 1.0085180324804246e-07, + "loss": 0.0046, + "reward": 1.1830357611179352, + "reward_std": 0.3003153130412102, + "rewards/accuracy_reward": 0.227678582072258, + "rewards/format_reward": 0.9553571790456772, + "step": 3288 + }, + { + "completion_length": 932.2187957763672, + "epoch": 0.9824508998581136, + "grad_norm": 2.0900650024414062, + "kl": 0.8251953125, + "learning_rate": 1.0082318199442449e-07, + "loss": 0.0222, + "reward": 1.0691964626312256, + "reward_std": 0.24868462979793549, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.9397321939468384, + "step": 3289 + }, + { + "completion_length": 863.9821929931641, + "epoch": 0.9827496079456352, + "grad_norm": 1.848851203918457, + "kl": 0.599609375, + "learning_rate": 1.0079504940116038e-07, + "loss": 0.0634, + "reward": 1.129464328289032, + "reward_std": 0.21062462776899338, + "rewards/accuracy_reward": 0.15848215017467737, + "rewards/format_reward": 0.9709821790456772, + "step": 3290 + }, + { + "completion_length": 973.6607513427734, + "epoch": 0.9830483160331566, + "grad_norm": 1.3911136388778687, + "kl": 0.779296875, + "learning_rate": 1.0076740549885572e-07, + "loss": 0.0226, + "reward": 1.1584822237491608, + "reward_std": 0.21751123294234276, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.9642857760190964, + "step": 3291 + }, + { + "completion_length": 863.5803833007812, + "epoch": 0.9833470241206781, + "grad_norm": 1.4823641777038574, + "kl": 0.7509765625, + "learning_rate": 1.0074025031758441e-07, + "loss": 0.0257, + "reward": 1.1696428954601288, + "reward_std": 0.236037690192461, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.9665178954601288, + "step": 3292 + }, + { + "completion_length": 858.2701263427734, + "epoch": 0.9836457322081995, + "grad_norm": 1.8677241802215576, + "kl": 0.9765625, + "learning_rate": 1.0071358388688851e-07, + "loss": 0.0397, + "reward": 1.1517857611179352, + "reward_std": 0.25402168929576874, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.957589328289032, + "step": 3293 + }, + { + "completion_length": 958.4330902099609, + "epoch": 0.983944440295721, + "grad_norm": 0.8846431970596313, + "kl": 0.9970703125, + "learning_rate": 1.0068740623577857e-07, + "loss": 0.0515, + "reward": 1.1183035969734192, + "reward_std": 0.27098214626312256, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.9575893431901932, + "step": 3294 + }, + { + "completion_length": 933.6518249511719, + "epoch": 0.9842431483832424, + "grad_norm": 2.1618294715881348, + "kl": 0.91015625, + "learning_rate": 1.0066171739273326e-07, + "loss": 0.0557, + "reward": 1.084821492433548, + "reward_std": 0.2793995290994644, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.9531250298023224, + "step": 3295 + }, + { + "completion_length": 1004.1049499511719, + "epoch": 0.984541856470764, + "grad_norm": 1.6127601861953735, + "kl": 0.896484375, + "learning_rate": 1.0063651738569956e-07, + "loss": 0.0288, + "reward": 1.0357143580913544, + "reward_std": 0.1603496791794896, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.9620536118745804, + "step": 3296 + }, + { + "completion_length": 926.8237152099609, + "epoch": 0.9848405645582854, + "grad_norm": 2.214730978012085, + "kl": 1.2685546875, + "learning_rate": 1.0061180624209255e-07, + "loss": 0.077, + "reward": 1.1227679252624512, + "reward_std": 0.2618231698870659, + "rewards/accuracy_reward": 0.17410715483129025, + "rewards/format_reward": 0.9486607611179352, + "step": 3297 + }, + { + "completion_length": 918.2031707763672, + "epoch": 0.9851392726458069, + "grad_norm": 1.9936867952346802, + "kl": 0.9365234375, + "learning_rate": 1.0058758398879562e-07, + "loss": 0.0265, + "reward": 1.04464291036129, + "reward_std": 0.2099439511075616, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.9553571790456772, + "step": 3298 + }, + { + "completion_length": 917.5089721679688, + "epoch": 0.9854379807333283, + "grad_norm": 1.4279828071594238, + "kl": 0.9482421875, + "learning_rate": 1.0056385065216011e-07, + "loss": 0.0339, + "reward": 1.0312500447034836, + "reward_std": 0.19470332190394402, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.9486607611179352, + "step": 3299 + }, + { + "completion_length": 940.8861999511719, + "epoch": 0.9857366888208499, + "grad_norm": 1.7152913808822632, + "kl": 0.72802734375, + "learning_rate": 1.0054060625800564e-07, + "loss": 0.0514, + "reward": 1.1026786267757416, + "reward_std": 0.26164938509464264, + "rewards/accuracy_reward": 0.14062500558793545, + "rewards/format_reward": 0.9620536118745804, + "step": 3300 + }, + { + "completion_length": 843.1607666015625, + "epoch": 0.9860353969083713, + "grad_norm": 1.9350619316101074, + "kl": 0.65380859375, + "learning_rate": 1.0051785083161985e-07, + "loss": 0.0499, + "reward": 1.160714328289032, + "reward_std": 0.21855534613132477, + "rewards/accuracy_reward": 0.2075892984867096, + "rewards/format_reward": 0.9531250447034836, + "step": 3301 + }, + { + "completion_length": 967.7433471679688, + "epoch": 0.9863341049958928, + "grad_norm": 1.4570785760879517, + "kl": 0.830078125, + "learning_rate": 1.0049558439775828e-07, + "loss": 0.0728, + "reward": 1.035714328289032, + "reward_std": 0.24597486853599548, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.941964328289032, + "step": 3302 + }, + { + "completion_length": 925.0245971679688, + "epoch": 0.9866328130834142, + "grad_norm": 1.3381036520004272, + "kl": 0.681640625, + "learning_rate": 1.0047380698064481e-07, + "loss": 0.0429, + "reward": 1.1205357611179352, + "reward_std": 0.21419230103492737, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9642857313156128, + "step": 3303 + }, + { + "completion_length": 899.2210083007812, + "epoch": 0.9869315211709357, + "grad_norm": 2.2973763942718506, + "kl": 0.794921875, + "learning_rate": 1.0045251860397098e-07, + "loss": 0.055, + "reward": 1.1383928954601288, + "reward_std": 0.24360055848956108, + "rewards/accuracy_reward": 0.1830357275903225, + "rewards/format_reward": 0.9553571939468384, + "step": 3304 + }, + { + "completion_length": 966.8192596435547, + "epoch": 0.9872302292584572, + "grad_norm": 2.80193829536438, + "kl": 0.9267578125, + "learning_rate": 1.0043171929089653e-07, + "loss": 0.0764, + "reward": 1.084821492433548, + "reward_std": 0.2936510406434536, + "rewards/accuracy_reward": 0.14732143748551607, + "rewards/format_reward": 0.9375000596046448, + "step": 3305 + }, + { + "completion_length": 1032.745590209961, + "epoch": 0.9875289373459787, + "grad_norm": 1.0050294399261475, + "kl": 0.8408203125, + "learning_rate": 1.0041140906404907e-07, + "loss": 0.058, + "reward": 1.0580357909202576, + "reward_std": 0.21384082362055779, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.957589328289032, + "step": 3306 + }, + { + "completion_length": 910.9241333007812, + "epoch": 0.9878276454335001, + "grad_norm": 0.8039256930351257, + "kl": 0.72265625, + "learning_rate": 1.0039158794552413e-07, + "loss": 0.0224, + "reward": 1.1674107611179352, + "reward_std": 0.27821463719010353, + "rewards/accuracy_reward": 0.2031250037252903, + "rewards/format_reward": 0.964285746216774, + "step": 3307 + }, + { + "completion_length": 914.2031402587891, + "epoch": 0.9881263535210216, + "grad_norm": 1.1765540838241577, + "kl": 0.8134765625, + "learning_rate": 1.0037225595688517e-07, + "loss": 0.0245, + "reward": 1.1383929252624512, + "reward_std": 0.25923679769039154, + "rewards/accuracy_reward": 0.18526786658912897, + "rewards/format_reward": 0.9531250596046448, + "step": 3308 + }, + { + "completion_length": 851.1607513427734, + "epoch": 0.988425061608543, + "grad_norm": 1.077142596244812, + "kl": 0.958984375, + "learning_rate": 1.0035341311916344e-07, + "loss": 0.0553, + "reward": 1.1651786267757416, + "reward_std": 0.24280064180493355, + "rewards/accuracy_reward": 0.212053582072258, + "rewards/format_reward": 0.9531250298023224, + "step": 3309 + }, + { + "completion_length": 866.341552734375, + "epoch": 0.9887237696960646, + "grad_norm": 3.3510420322418213, + "kl": 0.8779296875, + "learning_rate": 1.0033505945285818e-07, + "loss": 0.0709, + "reward": 1.1785714626312256, + "reward_std": 0.32663195952773094, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.9486607611179352, + "step": 3310 + }, + { + "completion_length": 872.2254943847656, + "epoch": 0.989022477783586, + "grad_norm": 1.1308916807174683, + "kl": 0.9326171875, + "learning_rate": 1.0031719497793628e-07, + "loss": 0.0211, + "reward": 1.104910746216774, + "reward_std": 0.23407945036888123, + "rewards/accuracy_reward": 0.14285714784637094, + "rewards/format_reward": 0.9620536118745804, + "step": 3311 + }, + { + "completion_length": 962.6116485595703, + "epoch": 0.9893211858711075, + "grad_norm": 3.033672571182251, + "kl": 1.0576171875, + "learning_rate": 1.0029981971383263e-07, + "loss": 0.0347, + "reward": 0.9709821939468384, + "reward_std": 0.1883972194045782, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.9553571939468384, + "step": 3312 + }, + { + "completion_length": 973.8036041259766, + "epoch": 0.9896198939586289, + "grad_norm": 1.3029495477676392, + "kl": 0.5947265625, + "learning_rate": 1.0028293367944976e-07, + "loss": 0.0182, + "reward": 1.1183035969734192, + "reward_std": 0.23179568350315094, + "rewards/accuracy_reward": 0.16294643399305642, + "rewards/format_reward": 0.9553571790456772, + "step": 3313 + }, + { + "completion_length": 914.1830749511719, + "epoch": 0.9899186020461505, + "grad_norm": 3.1229496002197266, + "kl": 0.9130859375, + "learning_rate": 1.0026653689315804e-07, + "loss": 0.023, + "reward": 1.113839328289032, + "reward_std": 0.23415380716323853, + "rewards/accuracy_reward": 0.15401786379516125, + "rewards/format_reward": 0.9598214626312256, + "step": 3314 + }, + { + "completion_length": 1022.8437805175781, + "epoch": 0.9902173101336719, + "grad_norm": 1.1304656267166138, + "kl": 0.91015625, + "learning_rate": 1.0025062937279558e-07, + "loss": 0.0468, + "reward": 1.0602678805589676, + "reward_std": 0.20225585624575615, + "rewards/accuracy_reward": 0.09821429033763707, + "rewards/format_reward": 0.9620536118745804, + "step": 3315 + }, + { + "completion_length": 968.1139068603516, + "epoch": 0.9905160182211933, + "grad_norm": 1.7561370134353638, + "kl": 1.01953125, + "learning_rate": 1.0023521113566814e-07, + "loss": 0.0357, + "reward": 1.1183036267757416, + "reward_std": 0.24862359464168549, + "rewards/accuracy_reward": 0.16071429662406445, + "rewards/format_reward": 0.957589328289032, + "step": 3316 + }, + { + "completion_length": 938.5536193847656, + "epoch": 0.9908147263087148, + "grad_norm": 2.8989968299865723, + "kl": 0.77001953125, + "learning_rate": 1.0022028219854932e-07, + "loss": 0.0195, + "reward": 1.0803571939468384, + "reward_std": 0.19003304839134216, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.9665178954601288, + "step": 3317 + }, + { + "completion_length": 953.7143096923828, + "epoch": 0.9911134343962362, + "grad_norm": 1.1834537982940674, + "kl": 0.7158203125, + "learning_rate": 1.0020584257768032e-07, + "loss": 0.0131, + "reward": 1.082589328289032, + "reward_std": 0.24152741208672523, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.9486607611179352, + "step": 3318 + }, + { + "completion_length": 827.8527221679688, + "epoch": 0.9914121424837578, + "grad_norm": 1.9667497873306274, + "kl": 1.044921875, + "learning_rate": 1.0019189228877002e-07, + "loss": 0.0168, + "reward": 1.0513393431901932, + "reward_std": 0.210746668279171, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.9508928805589676, + "step": 3319 + }, + { + "completion_length": 860.8928985595703, + "epoch": 0.9917108505712792, + "grad_norm": 1.5470824241638184, + "kl": 0.8056640625, + "learning_rate": 1.00178431346995e-07, + "loss": 0.0188, + "reward": 1.174107164144516, + "reward_std": 0.24732068181037903, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.9598214626312256, + "step": 3320 + }, + { + "completion_length": 899.1004943847656, + "epoch": 0.9920095586588007, + "grad_norm": 1.0308464765548706, + "kl": 0.8525390625, + "learning_rate": 1.001654597669994e-07, + "loss": 0.0588, + "reward": 1.2165178954601288, + "reward_std": 0.24470793083310127, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.9776786118745804, + "step": 3321 + }, + { + "completion_length": 898.6228179931641, + "epoch": 0.9923082667463221, + "grad_norm": 0.8637545704841614, + "kl": 0.75244140625, + "learning_rate": 1.0015297756289508e-07, + "loss": 0.0256, + "reward": 1.1406250596046448, + "reward_std": 0.2339707538485527, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9687500298023224, + "step": 3322 + }, + { + "completion_length": 881.4978179931641, + "epoch": 0.9926069748338436, + "grad_norm": 1.1802935600280762, + "kl": 0.759765625, + "learning_rate": 1.001409847482614e-07, + "loss": 0.0497, + "reward": 1.2053571939468384, + "reward_std": 0.31999166682362556, + "rewards/accuracy_reward": 0.26339287124574184, + "rewards/format_reward": 0.941964328289032, + "step": 3323 + }, + { + "completion_length": 920.7031860351562, + "epoch": 0.992905682921365, + "grad_norm": 2.432866334915161, + "kl": 0.8779296875, + "learning_rate": 1.0012948133614543e-07, + "loss": -0.0354, + "reward": 1.0758929252624512, + "reward_std": 0.29605384171009064, + "rewards/accuracy_reward": 0.12053572200238705, + "rewards/format_reward": 0.9553571790456772, + "step": 3324 + }, + { + "completion_length": 867.5915679931641, + "epoch": 0.9932043910088866, + "grad_norm": 1.3517725467681885, + "kl": 0.6748046875, + "learning_rate": 1.0011846733906167e-07, + "loss": 0.0227, + "reward": 1.1383929252624512, + "reward_std": 0.261090699583292, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.9665178954601288, + "step": 3325 + }, + { + "completion_length": 971.1295013427734, + "epoch": 0.993503099096408, + "grad_norm": 1.1124998331069946, + "kl": 0.6708984375, + "learning_rate": 1.0010794276899233e-07, + "loss": 0.0468, + "reward": 1.1160714775323868, + "reward_std": 0.20089998468756676, + "rewards/accuracy_reward": 0.16071428824216127, + "rewards/format_reward": 0.9553571790456772, + "step": 3326 + }, + { + "completion_length": 925.0603179931641, + "epoch": 0.9938018071839295, + "grad_norm": 1.228258490562439, + "kl": 0.8603515625, + "learning_rate": 1.0009790763738709e-07, + "loss": 0.0174, + "reward": 1.0758929252624512, + "reward_std": 0.2824285365641117, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.9375000298023224, + "step": 3327 + }, + { + "completion_length": 805.8527221679688, + "epoch": 0.9941005152714509, + "grad_norm": 1.515575647354126, + "kl": 0.92578125, + "learning_rate": 1.0008836195516322e-07, + "loss": 0.0683, + "reward": 1.098214328289032, + "reward_std": 0.20710909739136696, + "rewards/accuracy_reward": 0.12276786426082253, + "rewards/format_reward": 0.9754464626312256, + "step": 3328 + }, + { + "completion_length": 925.2835083007812, + "epoch": 0.9943992233589725, + "grad_norm": 0.91594398021698, + "kl": 0.8583984375, + "learning_rate": 1.0007930573270547e-07, + "loss": 0.0321, + "reward": 1.1540178954601288, + "reward_std": 0.2938239648938179, + "rewards/accuracy_reward": 0.19866072200238705, + "rewards/format_reward": 0.9553571939468384, + "step": 3329 + }, + { + "completion_length": 957.0648040771484, + "epoch": 0.9946979314464939, + "grad_norm": 1.0758204460144043, + "kl": 0.853515625, + "learning_rate": 1.0007073897986607e-07, + "loss": 0.0431, + "reward": 1.0848214626312256, + "reward_std": 0.2706182934343815, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.95089291036129, + "step": 3330 + }, + { + "completion_length": 955.7344055175781, + "epoch": 0.9949966395340154, + "grad_norm": 1.9241985082626343, + "kl": 0.73583984375, + "learning_rate": 1.0006266170596488e-07, + "loss": 0.045, + "reward": 1.1830357909202576, + "reward_std": 0.2614474445581436, + "rewards/accuracy_reward": 0.2232143022119999, + "rewards/format_reward": 0.959821492433548, + "step": 3331 + }, + { + "completion_length": 967.1786346435547, + "epoch": 0.9952953476215368, + "grad_norm": 0.9702433347702026, + "kl": 0.857421875, + "learning_rate": 1.0005507391978915e-07, + "loss": 0.0355, + "reward": 1.1294643580913544, + "reward_std": 0.27883657440543175, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.957589328289032, + "step": 3332 + }, + { + "completion_length": 1000.3795166015625, + "epoch": 0.9955940557090583, + "grad_norm": 0.7381289005279541, + "kl": 0.70703125, + "learning_rate": 1.0004797562959367e-07, + "loss": 0.0252, + "reward": 1.0803571790456772, + "reward_std": 0.16245376504957676, + "rewards/accuracy_reward": 0.11383928847499192, + "rewards/format_reward": 0.9665178954601288, + "step": 3333 + }, + { + "completion_length": 891.5089721679688, + "epoch": 0.9958927637965798, + "grad_norm": 1.1990212202072144, + "kl": 0.748046875, + "learning_rate": 1.0004136684310066e-07, + "loss": 0.0128, + "reward": 1.1250000298023224, + "reward_std": 0.17231981828808784, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.979910746216774, + "step": 3334 + }, + { + "completion_length": 897.7031707763672, + "epoch": 0.9961914718841013, + "grad_norm": 1.0663059949874878, + "kl": 0.6748046875, + "learning_rate": 1.0003524756749982e-07, + "loss": 0.0285, + "reward": 1.129464328289032, + "reward_std": 0.2620139941573143, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.9642857611179352, + "step": 3335 + }, + { + "completion_length": 973.6273040771484, + "epoch": 0.9964901799716227, + "grad_norm": 1.2286643981933594, + "kl": 0.8173828125, + "learning_rate": 1.0002961780944834e-07, + "loss": 0.0603, + "reward": 1.1540179252624512, + "reward_std": 0.2743792124092579, + "rewards/accuracy_reward": 0.21205357648432255, + "rewards/format_reward": 0.941964328289032, + "step": 3336 + }, + { + "completion_length": 959.716552734375, + "epoch": 0.9967888880591442, + "grad_norm": 0.5663866996765137, + "kl": 0.5986328125, + "learning_rate": 1.0002447757507084e-07, + "loss": 0.0091, + "reward": 1.13839291036129, + "reward_std": 0.16334878839552402, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.9687500447034836, + "step": 3337 + }, + { + "completion_length": 912.3236846923828, + "epoch": 0.9970875961466656, + "grad_norm": 1.234182596206665, + "kl": 0.62109375, + "learning_rate": 1.0001982686995942e-07, + "loss": 0.035, + "reward": 1.1227678954601288, + "reward_std": 0.2203272134065628, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.9598214626312256, + "step": 3338 + }, + { + "completion_length": 969.8527221679688, + "epoch": 0.9973863042341872, + "grad_norm": 1.00955069065094, + "kl": 0.51513671875, + "learning_rate": 1.0001566569917358e-07, + "loss": 0.0022, + "reward": 1.1316964626312256, + "reward_std": 0.22248350456357002, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.9464286118745804, + "step": 3339 + }, + { + "completion_length": 856.1339721679688, + "epoch": 0.9976850123217086, + "grad_norm": 1.2981094121932983, + "kl": 0.5888671875, + "learning_rate": 1.0001199406724024e-07, + "loss": 0.036, + "reward": 1.1919643580913544, + "reward_std": 0.20118503272533417, + "rewards/accuracy_reward": 0.2232142947614193, + "rewards/format_reward": 0.9687500596046448, + "step": 3340 + }, + { + "completion_length": 930.2879791259766, + "epoch": 0.9979837204092301, + "grad_norm": 1.3237806558609009, + "kl": 0.71533203125, + "learning_rate": 1.0000881197815381e-07, + "loss": 0.055, + "reward": 1.066964328289032, + "reward_std": 0.22473187372088432, + "rewards/accuracy_reward": 0.11607143562287092, + "rewards/format_reward": 0.9508928954601288, + "step": 3341 + }, + { + "completion_length": 947.9665679931641, + "epoch": 0.9982824284967515, + "grad_norm": 2.335517168045044, + "kl": 0.869140625, + "learning_rate": 1.0000611943537603e-07, + "loss": 0.0307, + "reward": 1.2433036267757416, + "reward_std": 0.31285278499126434, + "rewards/accuracy_reward": 0.2879464402794838, + "rewards/format_reward": 0.9553571790456772, + "step": 3342 + }, + { + "completion_length": 890.8661041259766, + "epoch": 0.998581136584273, + "grad_norm": 0.9564826488494873, + "kl": 0.6572265625, + "learning_rate": 1.0000391644183618e-07, + "loss": -0.0152, + "reward": 1.1406250596046448, + "reward_std": 0.27853843942284584, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.9531250447034836, + "step": 3343 + }, + { + "completion_length": 876.7745819091797, + "epoch": 0.9988798446717945, + "grad_norm": 1.213674545288086, + "kl": 0.9638671875, + "learning_rate": 1.0000220299993092e-07, + "loss": 0.0321, + "reward": 1.0625000447034836, + "reward_std": 0.2664346247911453, + "rewards/accuracy_reward": 0.12053571944124997, + "rewards/format_reward": 0.941964328289032, + "step": 3344 + }, + { + "completion_length": 1015.5826416015625, + "epoch": 0.999178552759316, + "grad_norm": 1.3387099504470825, + "kl": 0.7841796875, + "learning_rate": 1.0000097911152421e-07, + "loss": 0.0312, + "reward": 1.051339328289032, + "reward_std": 0.23354855738580227, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.9441964626312256, + "step": 3345 + }, + { + "completion_length": 825.5134124755859, + "epoch": 0.9994772608468374, + "grad_norm": 1.087984561920166, + "kl": 0.525390625, + "learning_rate": 1.0000024477794761e-07, + "loss": 0.0165, + "reward": 1.178571492433548, + "reward_std": 0.17732860147953033, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.979910746216774, + "step": 3346 + }, + { + "completion_length": 934.8281555175781, + "epoch": 0.9997759689343589, + "grad_norm": 1.3981213569641113, + "kl": 0.7587890625, + "learning_rate": 1e-07, + "loss": 0.045, + "reward": 1.1696428954601288, + "reward_std": 0.2799237184226513, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.9553571790456772, + "step": 3347 + }, + { + "epoch": 0.9997759689343589, + "step": 3347, + "total_flos": 0.0, + "train_loss": 0.05968558385844283, + "train_runtime": 235081.4615, + "train_samples_per_second": 0.399, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 1, + "max_steps": 3347, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}